CA4 (Machine Learning)
Training different classifiers on a dataset and test models.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from copy import deepcopy
from IPython.display import display, HTML
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from mlxtend.evaluate import bias_variance_decomp
from dataclasses import dataclass
from xgboost import XGBClassifier
# Name of the column the classifiers will predict.
TARGET_COLUMN = 'NumPurchases'
# Load the marketing-campaign dataset from the working directory.
df = pd.read_csv('marketing_campaign.csv')
# delete the first column as it is just a normal counting number.
# df = df.drop(df.columns[0], axis=1)
# Show every column when displaying the frame (no "..." truncation).
pd.set_option("display.max_columns", None)
df.head(10)
| Unnamed: 0 | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635.0 | 88 | 546 | 172 | 88 | 88.0 | NaN | 0 | 25 | 1 |
| 1 | 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | NaN | 1 | 6 | 2 | 1 | 6.0 | 5.0 | 0 | 6 | 0 |
| 2 | 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | NaN | 49 | 127 | 111 | 21 | 42.0 | NaN | 0 | 21 | 0 |
| 3 | 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11.0 | 4 | 20 | 10 | 3 | 5.0 | 6.0 | 0 | 8 | 0 |
| 4 | 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173.0 | 43 | 118 | 46 | 27 | 15.0 | 5.0 | 0 | 19 | 0 |
| 5 | 5 | 7446 | 1967 | Master | Together | NaN | 0 | 1 | 09-09-2013 | 16 | 520.0 | 42 | 98 | 0 | 42 | 14.0 | NaN | 0 | 22 | 0 |
| 6 | 6 | 965 | 1971 | Graduation | Divorced | 55635.0 | 0 | 1 | 13-11-2012 | 34 | 235.0 | 65 | 164 | 50 | 49 | 27.0 | 6.0 | 0 | 21 | 0 |
| 7 | 7 | 6177 | 1985 | PhD | Married | 33454.0 | 1 | 0 | 08-05-2013 | 32 | 76.0 | 10 | 56 | 3 | 1 | 23.0 | 8.0 | 0 | 10 | 0 |
| 8 | 8 | 4855 | 1974 | PhD | Together | 30351.0 | 1 | 0 | 06-06-2013 | 19 | 14.0 | 0 | 24 | 3 | 3 | 2.0 | 9.0 | 0 | 6 | 1 |
| 9 | 9 | 5899 | 1950 | PhD | Together | 5648.0 | 1 | 1 | 13-03-2014 | 68 | 28.0 | 0 | 6 | 1 | 1 | 13.0 | 20.0 | 0 | 2 | 1 |
The `info` method shows general information about our data frame, for example, its data and data types. As we can see, we have 2240 entries in our data frame.
It includes 19 columns including its name, Non-Null count, and Dtype(data type).
Means how many rows have a value in a specific column.
At last, there is a printed count of each datatype, and data frames memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 2240 non-null int64 1 ID 2240 non-null int64 2 Year_Birth 2240 non-null int64 3 Education 2240 non-null object 4 Marital_Status 2240 non-null object 5 Income 2017 non-null float64 6 Kidhome 2240 non-null int64 7 Teenhome 2240 non-null int64 8 Dt_Customer 2240 non-null object 9 Recency 2240 non-null int64 10 MntCoffee 2035 non-null float64 11 MntFruits 2240 non-null int64 12 MntMeatProducts 2240 non-null int64 13 MntFishProducts 2240 non-null int64 14 MntSweetProducts 2240 non-null int64 15 MntGoldProds 2227 non-null float64 16 NumWebVisitsMonth 2040 non-null float64 17 Complain 2240 non-null int64 18 NumPurchases 2240 non-null int64 19 UsedCampaignOffer 2240 non-null int64 dtypes: float64(4), int64(13), object(3) memory usage: 350.1+ KB
The describe method shows some statistical information about our dataframe.
Each table row reports a property of the corresponding column's data:
df.describe()
| Unnamed: 0 | ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2240.000000 | 2240.000000 | 2240.000000 | 2017.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2035.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2227.000000 | 2040.000000 | 2240.000000 | 2240.000000 | 2240.000000 |
| mean | 1119.500000 | 5592.159821 | 1968.805804 | 52297.080317 | 0.437946 | 0.506250 | 49.109375 | 304.239312 | 26.302232 | 166.950000 | 37.525446 | 27.062946 | 43.847777 | 5.326961 | 0.009375 | 14.862054 | 0.271875 |
| std | 646.776623 | 3246.662198 | 11.984069 | 25543.108215 | 0.563666 | 0.544538 | 28.962453 | 337.515534 | 39.773434 | 225.715373 | 54.628979 | 41.280498 | 51.897098 | 2.439349 | 0.096391 | 7.677173 | 0.445025 |
| min | 0.000000 | 0.000000 | 1893.000000 | 2447.000000 | -5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 559.750000 | 2828.250000 | 1959.000000 | 35340.000000 | 0.000000 | 0.000000 | 24.000000 | 23.000000 | 1.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 3.000000 | 0.000000 | 8.000000 | 0.000000 |
| 50% | 1119.500000 | 5458.500000 | 1970.000000 | 51369.000000 | 0.000000 | 0.000000 | 49.000000 | 177.000000 | 8.000000 | 67.000000 | 12.000000 | 8.000000 | 24.000000 | 6.000000 | 0.000000 | 15.000000 | 0.000000 |
| 75% | 1679.250000 | 8427.750000 | 1977.000000 | 68316.000000 | 1.000000 | 1.000000 | 74.000000 | 505.000000 | 33.000000 | 232.000000 | 50.000000 | 33.000000 | 56.000000 | 7.000000 | 0.000000 | 21.000000 | 1.000000 |
| max | 2239.000000 | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 263.000000 | 362.000000 | 20.000000 | 1.000000 | 44.000000 | 1.000000 |
def missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing data per column.

    Returns a frame indexed by column name with two columns:
    'Missing' (absolute NaN count) and 'Percentage' (NaN fraction, 0-1).
    """
    counts = df.isna().sum()
    fractions = counts / len(df)
    return pd.concat([counts, fractions], axis=1, keys=["Missing", "Percentage"])
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.000000 |
| ID | 0 | 0.000000 |
| Year_Birth | 0 | 0.000000 |
| Education | 0 | 0.000000 |
| Marital_Status | 0 | 0.000000 |
| Income | 223 | 0.099554 |
| Kidhome | 0 | 0.000000 |
| Teenhome | 0 | 0.000000 |
| Dt_Customer | 0 | 0.000000 |
| Recency | 0 | 0.000000 |
| MntCoffee | 205 | 0.091518 |
| MntFruits | 0 | 0.000000 |
| MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | 13 | 0.005804 |
| NumWebVisitsMonth | 200 | 0.089286 |
| Complain | 0 | 0.000000 |
| NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | 0 | 0.000000 |
# Flat missing-value overview: one row per feature, with the column name
# repeated in 'Feature' and the share of NaNs expressed in percent (0-100).
null_mask = df.isnull()
missing_data = pd.DataFrame({
    'Feature': df.columns,
    'Missing Count': null_mask.sum(),
    'Missing Percentage': null_mask.mean() * 100,
})
missing_data
| Feature | Missing Count | Missing Percentage | |
|---|---|---|---|
| Unnamed: 0 | Unnamed: 0 | 0 | 0.000000 |
| ID | ID | 0 | 0.000000 |
| Year_Birth | Year_Birth | 0 | 0.000000 |
| Education | Education | 0 | 0.000000 |
| Marital_Status | Marital_Status | 0 | 0.000000 |
| Income | Income | 223 | 9.955357 |
| Kidhome | Kidhome | 0 | 0.000000 |
| Teenhome | Teenhome | 0 | 0.000000 |
| Dt_Customer | Dt_Customer | 0 | 0.000000 |
| Recency | Recency | 0 | 0.000000 |
| MntCoffee | MntCoffee | 205 | 9.151786 |
| MntFruits | MntFruits | 0 | 0.000000 |
| MntMeatProducts | MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | MntGoldProds | 13 | 0.580357 |
| NumWebVisitsMonth | NumWebVisitsMonth | 200 | 8.928571 |
| Complain | Complain | 0 | 0.000000 |
| NumPurchases | NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | UsedCampaignOffer | 0 | 0.000000 |
$\rho_{XY} = \frac{\text{cov}(X, Y)}{\sigma_X \sigma_Y}$
$\text{cov}(X, Y) = \frac{\sum_{i=1}^{n}(X_i - \bar{X})(Y_i - \bar{Y})}{n-1}$
$\sigma = \sqrt{\frac{\sum_{i=1}^{n}(X_i - \bar{X})^2}{n-1}}$
def plot_correlation_heatmap(df):
    """Draw an annotated heatmap of pairwise correlations between numeric columns."""
    corr = df.select_dtypes(include=['number']).corr()
    plt.figure(figsize=(15, 15))
    sns.heatmap(corr, annot=True, fmt=".3f", cmap="Blues", linewidths=1, square=True)
    plt.title('Correlation Matrix Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.show()
plot_correlation_heatmap(df)
here I consider the features with a correlation above 0.45
def select_features_by_correlation(df, target_column='NumPurchases', threshold_low=0.25, threshold_high=0.45):
    """Return the correlations of features whose absolute correlation with
    `target_column` exceeds BOTH thresholds, sorted from most positive to
    most negative.

    The result is a Series indexed by feature name with the signed
    correlation value.
    """
    corr_with_target = (
        df.select_dtypes(include=['number'])
          .corr()[target_column]
          .drop(target_column)
    )
    # Both cut-offs are applied, matching the original two-stage filter.
    keep = (abs(corr_with_target) > threshold_low) & (abs(corr_with_target) > threshold_high)
    return corr_with_target[keep].sort_values(ascending=False)
selected_features = select_features_by_correlation(df, target_column='NumPurchases')
selected_features
MntCoffee 0.715164 Income 0.562603 MntMeatProducts 0.554229 MntGoldProds 0.493939 MntSweetProducts 0.472876 MntFishProducts 0.469454 MntFruits 0.455461 Name: NumPurchases, dtype: float64
df[selected_features.index].nunique()
MntCoffee 747 Income 1810 MntMeatProducts 558 MntGoldProds 212 MntSweetProducts 177 MntFishProducts 182 MntFruits 158 dtype: int64
# visualize_feature_distribution: one histogram per selected feature.
for feature in selected_features.index:
    plt.hist(df[feature], edgecolor='white')
    # Fix: the feature values lie on the x axis and the y axis shows bin
    # counts — the original labeled the y axis with the feature name.
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()
def explore_relationship(df, selected_features):
    """Plot a scatter and a hexbin chart of each selected feature vs the target."""
    sns.set(rc={'figure.figsize': (10, 6)})
    for column in selected_features.index.to_list():
        scatter_plot(df, column, TARGET_COLUMN)
        hexbin_plot(df, column, TARGET_COLUMN)
def scatter_plot(df, feature, target_col=TARGET_COLUMN):
    """Scatter chart of a single feature against the target column."""
    sns.scatterplot(data=df, x=feature, y=target_col)
    plt.title(f'Scatter Plot: {feature} vs {target_col}')
    plt.show()
def hexbin_plot(df, feature, target_col=TARGET_COLUMN):
    """Hexbin joint plot showing the density of (feature, target) pairs."""
    sns.jointplot(data=df, x=feature, y=target_col, kind='hex', color='green')
    plt.suptitle(f'Hexbin Plot: {feature} vs {target_col}', y=1.02)
    plt.show()
Here, the scatter and hexbin diagrams are drawn for the chosen features in previous part.
With hexbin diagram, we can see the density of similar data.
explore_relationship(df, selected_features)
def box_plot(df, selected_features, target_col=TARGET_COLUMN):
    """Box plot of each selected feature, grouped by target value."""
    for column in selected_features.index.to_list():
        sns.boxplot(data=df, x=target_col, y=column)
        plt.title(f'Box Plot: {column} vs {target_col}')
        plt.show()
sns.set(rc={'figure.figsize': (14, 6)})
box_plot(df, selected_features)
# sns.set(rc={'figure.figsize': (14, 6)})
# # correlation_heatmap(df)
# plot_correlation_heatmap(df)
among several ways for handling this problem, Imputation and Dropping are explained here:
Imputation
All the missing values are replaced by a substitution. Substitutions:
Mean: as the mean can represent the whole data, using the average to fill missing values is simple and proper. Median: as outlier data can affect the mean negatively, a substitution that is not affected by outliers, like the median, is a good choice.
In categorical data, in which the mean and median are not defined in them, Mode is preferred.
filling missing values with randomly selected values between the column's minimum and maximum data could lead us to a good result.
In this method, based on properties relating to the row, we predict the amount and with a good prediction, our total column is nearer to real properties.
Dropping
in this method, we can either drop columns or drop rows.
each column containing some missing value is dropped. As we lose some valuable values by dropping the whole column, this method should be applied in the case that the missing values of the column are the majority part of the whole data in that column. The reason is when the missing values are too many, there is less amount of reliable value to fill in missing values with good precision.
This method is similar to the previous one in some ways, but this is considerable in that if what a column has is all missing values, then by applying this method, all rows of the table are gone! So it is better to apply on the rows with a majority of missing values.
This is dataset:
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.000000 |
| ID | 0 | 0.000000 |
| Year_Birth | 0 | 0.000000 |
| Education | 0 | 0.000000 |
| Marital_Status | 0 | 0.000000 |
| Income | 223 | 0.099554 |
| Kidhome | 0 | 0.000000 |
| Teenhome | 0 | 0.000000 |
| Dt_Customer | 0 | 0.000000 |
| Recency | 0 | 0.000000 |
| MntCoffee | 205 | 0.091518 |
| MntFruits | 0 | 0.000000 |
| MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | 13 | 0.005804 |
| NumWebVisitsMonth | 200 | 0.089286 |
| Complain | 0 | 0.000000 |
| NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | 0 | 0.000000 |
First of all, I delete invalid values from the columns — for example, negative values in features that can only be non-negative counts.
# Columns that can never legitimately be negative (counts, amounts, IDs).
pNegCols = ["UsedCampaignOffer", "NumPurchases", "Complain", "NumWebVisitsMonth", "MntGoldProds",
"MntSweetProducts", "MntFishProducts", "MntMeatProducts", "MntFruits", "MntCoffee",
"Recency", "Teenhome", "Kidhome", "Income", "Year_Birth", "ID"]
# Replace invalid negative entries with NaN so the imputation step below
# handles them.  NOTE: np.where returns a float64 ndarray, so every column
# in pNegCols is upcast to float64 on assignment.
df[pNegCols] = np.where(df[pNegCols] < 0, np.nan, df[pNegCols])
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.000000 |
| ID | 0 | 0.000000 |
| Year_Birth | 0 | 0.000000 |
| Education | 0 | 0.000000 |
| Marital_Status | 0 | 0.000000 |
| Income | 223 | 0.099554 |
| Kidhome | 4 | 0.001786 |
| Teenhome | 0 | 0.000000 |
| Dt_Customer | 0 | 0.000000 |
| Recency | 0 | 0.000000 |
| MntCoffee | 205 | 0.091518 |
| MntFruits | 0 | 0.000000 |
| MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | 13 | 0.005804 |
| NumWebVisitsMonth | 200 | 0.089286 |
| Complain | 0 | 0.000000 |
| NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | 0 | 0.000000 |
As we can see, the Kidhome column had some negative values, and I set them to NaN to fix it.
Respectively, Income, MntCoffee, and NumWebVisitsMonth, and then MntGoldProds have Missing values.
here i used median to fill in the missing values in data frame:
def fillna_with_median(df):
    """Fill every numeric column's NaNs with that column's median, in place.

    Returns the same (mutated) frame for convenient chaining.
    """
    medians = df.median(numeric_only=True)
    df.fillna(medians, inplace=True)
    return df
def fill_with_mode(df):
    """Fill NaNs in numeric columns with each column's mode, in place.

    Fix: the original returned None; it now returns the mutated frame so it
    mirrors fillna_with_median and the two imputers are interchangeable.
    """
    # mode() may yield several rows on ties; iloc[0] picks the smallest mode.
    mode_values = df.mode(numeric_only=True).iloc[0]
    df.fillna(mode_values, inplace=True)
    return df
# fill_with_mode(df)
fillna_with_median(df)
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.0 |
| ID | 0 | 0.0 |
| Year_Birth | 0 | 0.0 |
| Education | 0 | 0.0 |
| Marital_Status | 0 | 0.0 |
| Income | 0 | 0.0 |
| Kidhome | 0 | 0.0 |
| Teenhome | 0 | 0.0 |
| Dt_Customer | 0 | 0.0 |
| Recency | 0 | 0.0 |
| MntCoffee | 0 | 0.0 |
| MntFruits | 0 | 0.0 |
| MntMeatProducts | 0 | 0.0 |
| MntFishProducts | 0 | 0.0 |
| MntSweetProducts | 0 | 0.0 |
| MntGoldProds | 0 | 0.0 |
| NumWebVisitsMonth | 0 | 0.0 |
| Complain | 0 | 0.0 |
| NumPurchases | 0 | 0.0 |
| UsedCampaignOffer | 0 | 0.0 |
or we can simply delete the row containing more than two NaN values, and after deleting, use KNNImputer to fill the deleted rows.
These techniques aim to bring the values of different features onto a similar scale.
where $X$ is the original value of the feature, $Xmin$ is the minimum value in the feature, and $Xmax$ is the maximum value in the feature.
Normalization is useful when the features have different ranges, and algorithms like neural networks or k-nearest neighbors may perform better when the input features are within a consistent scale.
Standardization:
As the numerical features in this dataset have different scales, normalizing or standardizing them is beneficial.
def standardization(df, exclude_cols=None):
    """Z-score (mean 0, std 1) all numeric columns in place.

    Columns listed in `exclude_cols` are restored to their original values
    after scaling.  Fix: the original used a mutable default argument
    (`exclude_cols: list = []`), which is shared across calls — replaced by
    the None sentinel idiom.  Returns the mutated frame.
    """
    if exclude_cols is None:
        exclude_cols = []
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = StandardScaler().fit_transform(numeric_cols)
    # Put back the unscaled values for the excluded columns.
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df
def normalization(df, exclude_cols=None):
    """Min-max scale all numeric columns into [0, 1] in place.

    Columns listed in `exclude_cols` are restored to their original values
    after scaling.  Fix: the original used a mutable default argument
    (`exclude_cols: list = []`), which is shared across calls — replaced by
    the None sentinel idiom.  Returns the mutated frame.
    """
    if exclude_cols is None:
        exclude_cols = []
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = MinMaxScaler().fit_transform(numeric_cols)
    # Put back the unscaled values for the excluded columns.
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df
def plot_histogram(df):
    """Draw a grid of 20-bin histograms, one per column of the frame."""
    df.hist(figsize=(20, 15), bins=20)
    plt.show()
plot_histogram(df)
df = normalization(df, ['NumPurchases'])
plot_histogram(df)
df = standardization(df, ['NumPurchases'])
plot_histogram(df)
df.describe()
| Unnamed: 0 | ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -1.015061e-16 | -9.198991e-17 | 4.452787e-16 | -3.552714e-16 | 9.992007e-17 | -3.172066e-18 | 1.292617e-16 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | -3.053113e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -1.731278e+00 | -1.722818e+00 | -6.326960e+00 | -2.053225e+00 | -8.237017e-01 | -9.298944e-01 | -1.696001e+00 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | -9.728167e-02 | 0.000000 | -6.110569e-01 |
| 25% | -8.656389e-01 | -8.514982e-01 | -8.184192e-01 | -6.304878e-01 | -8.237017e-01 | -9.298944e-01 | -8.671566e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | -9.728167e-02 | 8.000000 | -6.110569e-01 |
| 50% | -9.616873e-17 | -4.117757e-02 | 9.967091e-02 | -3.448418e-02 | -8.237017e-01 | -9.298944e-01 | -3.777284e-03 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | -9.728167e-02 | 15.000000 | -6.110569e-01 |
| 75% | 8.656389e-01 | 8.735813e-01 | 6.839101e-01 | 5.852568e-01 | 1.034397e+00 | 9.069340e-01 | 8.596020e-01 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | -9.728167e-02 | 21.000000 | 1.636509e+00 |
| max | 1.731278e+00 | 1.724876e+00 | 2.269702e+00 | 2.535543e+01 | 2.892495e+00 | 2.743762e+00 | 1.722981e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 1.027943e+01 | 44.000000 | 1.636509e+00 |
We should encode the categorical features.
One-Hot Encoding:
This method is useful when the categories don't have an order. It is the most useful method for the algorithms that use the distance between the data points, such as
KNN. For example, if we have a feature with 3 categories, we can encode them as follows:
Category 1: 1, 0, 0
Category 2: 0, 1, 0
Category 3: 0, 0, 1
Label Encoding:
This method is useful when the categories have an order. For example, if we have a feature with 3 categories, we can encode them as follows:
Category 1: 0
Category 2: 1
Category 3: 2
Binary Encoding:
This method is useful when the categories don't have an order. It is somehow similar to the One-Hot Encoding method. For example, if we have a feature with 3 categories, we can encode them as follows:
Category 1: 00
Category 2: 01
Category 3: 10
Frequency Encoding:
This method is useful when the categories don't have an order. For example, if we have a feature with 3 categories, we can encode them as follows:
Category 1: 0.5
Category 2: 0.25
Category 3: 0.25
Target Encoding:
This method is useful when the categories don't have an order. For example, if we have a feature with 3 categories, we can encode them as follows:
Category 1: 0.5
Category 2: 0.25
Category 3: 0.75
No, not all of the features are categorical; only the non-numeric columns need encoding.
def handle_non_numeric_columns_label_encoding(df):
    """Label-encode every non-numeric column in place and return the frame."""
    encoder = LabelEncoder()
    non_numeric_columns = df.select_dtypes(exclude=['number']).columns
    for column in non_numeric_columns:
        # fit_transform refits the encoder for each column, so one shared
        # encoder instance is safe here.
        df[column] = encoder.fit_transform(df[column])
    return df
df = handle_non_numeric_columns_label_encoding(df)
df.describe()
| Unnamed: 0 | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -1.015061e-16 | -9.198991e-17 | 4.452787e-16 | 2.393750 | 3.729911 | -3.552714e-16 | 9.992007e-17 | -3.172066e-18 | 327.875446 | 1.292617e-16 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | -3.053113e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.124797 | 1.076277 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 190.165575 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -1.731278e+00 | -1.722818e+00 | -6.326960e+00 | 0.000000 | 0.000000 | -2.053225e+00 | -8.237017e-01 | -9.298944e-01 | 0.000000 | -1.696001e+00 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | -9.728167e-02 | 0.000000 | -6.110569e-01 |
| 25% | -8.656389e-01 | -8.514982e-01 | -8.184192e-01 | 2.000000 | 3.000000 | -6.304878e-01 | -8.237017e-01 | -9.298944e-01 | 163.750000 | -8.671566e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | -9.728167e-02 | 8.000000 | -6.110569e-01 |
| 50% | -9.616873e-17 | -4.117757e-02 | 9.967091e-02 | 2.000000 | 4.000000 | -3.448418e-02 | -8.237017e-01 | -9.298944e-01 | 326.000000 | -3.777284e-03 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | -9.728167e-02 | 15.000000 | -6.110569e-01 |
| 75% | 8.656389e-01 | 8.735813e-01 | 6.839101e-01 | 3.000000 | 5.000000 | 5.852568e-01 | 1.034397e+00 | 9.069340e-01 | 485.000000 | 8.596020e-01 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | -9.728167e-02 | 21.000000 | 1.636509e+00 |
| max | 1.731278e+00 | 1.724876e+00 | 2.269702e+00 | 4.000000 | 7.000000 | 2.535543e+01 | 2.892495e+00 | 2.743762e+00 | 662.000000 | 1.722981e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 1.027943e+01 | 44.000000 | 1.636509e+00 |
Columns having low correlation with target can be removed.
df.describe()
| Unnamed: 0 | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -1.015061e-16 | -9.198991e-17 | 4.452787e-16 | 2.393750 | 3.729911 | -3.552714e-16 | 9.992007e-17 | -3.172066e-18 | 327.875446 | 1.292617e-16 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | -3.053113e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.124797 | 1.076277 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 190.165575 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -1.731278e+00 | -1.722818e+00 | -6.326960e+00 | 0.000000 | 0.000000 | -2.053225e+00 | -8.237017e-01 | -9.298944e-01 | 0.000000 | -1.696001e+00 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | -9.728167e-02 | 0.000000 | -6.110569e-01 |
| 25% | -8.656389e-01 | -8.514982e-01 | -8.184192e-01 | 2.000000 | 3.000000 | -6.304878e-01 | -8.237017e-01 | -9.298944e-01 | 163.750000 | -8.671566e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | -9.728167e-02 | 8.000000 | -6.110569e-01 |
| 50% | -9.616873e-17 | -4.117757e-02 | 9.967091e-02 | 2.000000 | 4.000000 | -3.448418e-02 | -8.237017e-01 | -9.298944e-01 | 326.000000 | -3.777284e-03 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | -9.728167e-02 | 15.000000 | -6.110569e-01 |
| 75% | 8.656389e-01 | 8.735813e-01 | 6.839101e-01 | 3.000000 | 5.000000 | 5.852568e-01 | 1.034397e+00 | 9.069340e-01 | 485.000000 | 8.596020e-01 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | -9.728167e-02 | 21.000000 | 1.636509e+00 |
| max | 1.731278e+00 | 1.724876e+00 | 2.269702e+00 | 4.000000 | 7.000000 | 2.535543e+01 | 2.892495e+00 | 2.743762e+00 | 662.000000 | 1.722981e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 1.027943e+01 | 44.000000 | 1.636509e+00 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 2240 non-null float64 1 ID 2240 non-null float64 2 Year_Birth 2240 non-null float64 3 Education 2240 non-null int32 4 Marital_Status 2240 non-null int32 5 Income 2240 non-null float64 6 Kidhome 2240 non-null float64 7 Teenhome 2240 non-null float64 8 Dt_Customer 2240 non-null int32 9 Recency 2240 non-null float64 10 MntCoffee 2240 non-null float64 11 MntFruits 2240 non-null float64 12 MntMeatProducts 2240 non-null float64 13 MntFishProducts 2240 non-null float64 14 MntSweetProducts 2240 non-null float64 15 MntGoldProds 2240 non-null float64 16 NumWebVisitsMonth 2240 non-null float64 17 Complain 2240 non-null float64 18 NumPurchases 2240 non-null float64 19 UsedCampaignOffer 2240 non-null float64 dtypes: float64(17), int32(3) memory usage: 323.9 KB
plot_correlation_heatmap(df)
def remove_low_correlation_columns(df, threshold=0.2):
    """Drop, in place, every column whose absolute correlation with
    'NumPurchases' is below `threshold`.

    Fix: the original ignored the `threshold` parameter — a hard-coded
    `correlation_threshold = 0.2` local always took precedence, so callers
    could never change the cut-off.
    """
    correlations_with_target = df.corr()['NumPurchases']
    low_correlation_features = correlations_with_target[
        abs(correlations_with_target) < threshold].index
    df.drop(low_correlation_features, axis=1, inplace=True)
remove_low_correlation_columns(df)
plot_correlation_heatmap(df)
df.describe()
| Income | Kidhome | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -3.552714e-16 | 9.992007e-17 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -2.053225e+00 | -8.237017e-01 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | 0.000000 | -6.110569e-01 |
| 25% | -6.304878e-01 | -8.237017e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | 8.000000 | -6.110569e-01 |
| 50% | -3.448418e-02 | -8.237017e-01 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | 15.000000 | -6.110569e-01 |
| 75% | 5.852568e-01 | 1.034397e+00 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | 21.000000 | 1.636509e+00 |
| max | 2.535543e+01 | 2.892495e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 44.000000 | 1.636509e+00 |
Some common percentages for splitting the dataset into train and test sets:
$90\%$ train and $10\%$ test; $80\%$ train and $20\%$ test; $70\%$ train and $30\%$ test; $60\%$ train and $40\%$ test. Some ways to split data into training and test sets:
Randomly split the dataset into train and test sets
This method is the most common. But it has a problem. If we split the dataset randomly, the train and test sets may not have the same distribution.
Split the dataset based on the time
This method is useful when we have a time series dataset. But it is not useful in this case.
Split the dataset based on the target
This method is useful when we have an imbalanced dataset.
some of the most commonly used methods for random dataset splitting are outlined below:
train_test_split method of the scikit-learn library
randn method of the numpy library
sample method of the pandas library
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    """Split *df* into train/test features and targets.

    The target column is pulled out of the frame and the remaining columns
    (in sorted order, via ``columns.difference``) are used as predictors.

    :param df: full dataset including the target column
    :param target_column: name of the column to predict
    :param train_percent: fraction of rows placed in the training split
    :param random_state: seed for the deterministic shuffle
    :return: (x_train, x_test, y_train, y_test)
    """
    features = df[df.columns.difference([target_column])]
    labels = df[target_column]
    x_tr, x_te, y_tr, y_te = train_test_split(
        features, labels, train_size=train_percent, random_state=random_state)
    return x_tr, x_te, y_tr, y_te
# 80/20 split of the full frame using the default target column and seed.
x_train, x_test, y_train, y_test = split_data(df)
I set the random_state to a fixed value (1 in the code above) to make the split reproducible. It is used as a seed for a random number generator.
$20\%$ of the data will be used for testing, and the remaining $80\%$ will be used for training.
A validation set is used to evaluate the performance of the model and prevent overfitting. It is used to test the trained model before using the testing data. We usually use this set to tune the hyperparameters of the model.
This method partitions the dataset into k equally sized folds. In this case, the model is trained and evaluated k times, each time using a different fold as the test set and the remaining folds as the training set. This process helps assess the model's performance across different subsets of the data.
- The dataset is divided into k equally sized folds.
- Each fold serves as a test set while the remaining k-1 folds are used for training.
- The model is trained on the training set (k-1 folds).
- The trained model is then evaluated on the test set (the remaining fold).
- Steps 1 and 2 are repeated k times, with each of the k folds used exactly once as the test set.
- The performance metrics (e.g., accuracy, precision, recall) from each iteration are averaged to obtain a more robust evaluation of the model.
K-fold cross-validation provides a more reliable estimate of model performance compared to a single train-test split. It helps ensure that the model's performance is consistent across different subsets of the data.
It is commonly used for hyperparameter tuning, helping to find the optimal set of hyperparameters that generalize well to different data samples.
Main form of simple linear regression function: $$f(x) = \alpha x + \beta$$
Here we want to find the slope ($\alpha$) and the bias ($\beta$) by setting the derivatives of the Residual Sum of Squares (RSS) function to zero:
$$ RSS = \Sigma (y_i - (\hat{\beta} + \hat{\alpha} * x_i) )^2 $$
$$ \frac{\partial RSS}{\partial \beta} = \Sigma\, 2\,(-y_i + \hat{\beta} + \hat{\alpha} x_i) = 0$$ $$ \to \hat{\beta} = \bar{y} - \hat{\alpha} \bar{x} \to (1)$$
$$ \frac{\partial RSS}{\partial \alpha} = \Sigma (-2 x_i y_i + 2 \hat{\beta} x_i + 2\hat{\alpha} x_i ^ 2) = 0 \to (2)$$
$$ (1) , (2) \to \hat{\alpha} = \frac{\Sigma{(x_i - \bar{x})(y_i - \bar{y})}}{\Sigma{(x_i - \bar{x})^2}} \qquad \hat{\beta} = \bar{y} - \hat{\alpha} \bar{x}$$
Since the MntCoffee feature has the most correlation with the target column, it is the best practice to use it for Linear Regression of Order 1.
Based on the above formula, implement the function below to compute the parameters of a simple linear regression
$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$
$\alpha$ = $\frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$
Here, I used columns with the most correlations, but the focus is on MntCoffee
# Pick candidate features by their correlation with the target; the helper
# is defined in an earlier cell (threshold semantics — TODO confirm there).
selected_features = select_features_by_correlation(df, TARGET_COLUMN, 0.25, 0.477)
display(selected_features)
MntCoffee 0.678082 MntMeatProducts 0.554229 Income 0.535685 MntGoldProds 0.490752 Name: NumPurchases, dtype: float64
selected_features.describe()
count 4.000000 mean 0.564687 std 0.080157 min 0.490752 25% 0.524452 50% 0.544957 75% 0.585192 max 0.678082 Name: NumPurchases, dtype: float64
def simple_linear_regression(input_feature, output):
    """Fit y = intercept + slope * x by ordinary least squares.

    Uses the closed-form estimators:
        slope     = (Σxy − Σx·Σy / n) / (Σx² − (Σx)² / n)
        intercept = ȳ − slope · x̄

    :param input_feature: 1-D array-like of predictor values
    :param output: 1-D array-like of target values, same length
    :return: (intercept, slope)
    """
    # np.asarray lets plain Python lists work too (list * list would fail).
    x = np.asarray(input_feature)
    y = np.asarray(output)
    n = len(x)
    sum_x = np.sum(x)
    sum_y = np.sum(y)
    sum_xy = np.sum(x * y)
    sum_xx = np.sum(x * x)
    slope = (sum_xy - sum_x * sum_y / n) / (sum_xx - sum_x * sum_x / n)
    intercept = sum_y / n - slope * sum_x / n
    return (intercept, slope)
# returned as (intercept = alpha, slope = beta)
Now complete this get_regression_predictions(...) function to predict the value of given data based on the calculated intercept and slope
$\hat{y}$ = $\beta_0$ + $\beta_1 x$
def get_regression_predictions(input_feature, bias, slope):
    """Return the fitted line's predictions: y_hat = bias + slope * x.

    `bias` is the intercept (alpha) and `slope` is beta from the fit.
    """
    return bias + slope * input_feature
Now that we have a model and can make predictions, let's evaluate our model using Root Mean Square Error (RMSE). RMSE is the square root of the mean of the squared differences between the residuals, and the residuals is just a fancy word for the difference between the predicted output and the true output.
Complete the following function to compute the RMSE of a simple linear regression model given the input_feature, output, intercept and slope:
$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
def get_root_mean_square_error(predicted_values, outputs):
    """Compute RMSE = sqrt(mean((y_pred - y_true)^2)).

    :param predicted_values: array-like of model predictions
    :param outputs: array-like of true target values, same length
    :return: root mean square error as a plain float
    """
    predicted = np.asarray(predicted_values, dtype=float)
    actual = np.asarray(outputs, dtype=float)
    # Vectorized residuals replace the original Python-level zip/list loop.
    return float(np.sqrt(np.mean((predicted - actual) ** 2)))
The RMSE has no bound, thus it becomes challenging to determine whether a particular RMSE value is considered good or bad without any reference point. Instead, we use R2 score. The R2 score is calculated by comparing the sum of the squared differences between the actual and predicted values of the dependent variable to the total sum of squared differences between the actual and mean values of the dependent variable. The R2 score is formulated as below:
$$R^2 = 1 - \frac{SSres}{SStot} = 1 - \frac{\sum_{i=1}^{n} (y_{i,true} - y_{i,pred})^2}{\sum_{i=1}^{n} (y_{i,true} - \bar{y}_{true})^2} $$
Complete the following function to calculate the R2 score of a given input_feature, output, bias, and slope:
def get_r2_score(predicted_values, outputs):
    """Coefficient of determination: R² = 1 − SS_res / SS_tot.

    SS_res is the residual sum of squares of the predictions; SS_tot is the
    total sum of squares around the mean of the true outputs.
    """
    ss_res = np.sum(np.square(outputs - predicted_values))
    ss_tot = np.sum(np.square(outputs - np.mean(outputs)))
    return 1 - ss_res / ss_tot
Now calculate the fitness of the model. Remember to provide explanation for the outputs in your code!
$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$
$\alpha = \frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$
$\hat{y}$ = $\alpha + \beta x$
$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
$R2$ = $1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}$
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    """Partition *df* into (x_train, x_test, y_train, y_test).

    NOTE(review): this re-definition duplicates the earlier split_data cell.
    """
    predictors = df[df.columns.difference([target_column])]
    target = df[target_column]
    pieces = train_test_split(predictors, target,
                              train_size=train_percent,
                              random_state=random_state)
    train_x, test_x, train_y, test_y = pieces
    return train_x, test_x, train_y, test_y
x_train, x_test, y_train, y_test = split_data(df)
def log_parameters(feature, RMSE, R2_score, intercept, slope):
    """Print a short summary of one fitted simple regression.

    :param feature: name of the predictor column
    :param RMSE: root mean square error on the test set
    :param R2_score: coefficient of determination on the test set
    :param intercept: fitted intercept (additive term)
    :param slope: fitted slope (coefficient of x)
    """
    print(f"Feature  :{feature}")
    print(f"RMSE     :{RMSE:0.3f}")
    print(f"R2 Score :{R2_score:0.3f}")
    # Bug fix: the original printed `y = intercept * x + slope`, swapping the
    # roles of the two parameters in the displayed equation.
    print(f"y = {slope:0.3f} * x + {intercept:0.3f}")
    print("--------------------------------------------")
def plot_regression_line(x_test, y_test, feature, predicted_values):
    """Scatter the test points for *feature* and overlay the fitted line."""
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(x_test[feature], y_test, label="Actual Data")
    ax.plot(x_test[feature], predicted_values, color='red', label="Regression Line")
    ax.set_title(f"Regression Line for {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel(TARGET_COLUMN)
    ax.legend()
    plt.show()
# Fit, plot and score a one-feature model for every selected feature.
for feature in selected_features.index:
    intercept, slope = simple_linear_regression(x_train[feature], y_train)
    predicted_values = get_regression_predictions(x_test[feature], intercept, slope)
    plot_regression_line(x_test, y_test, feature, predicted_values)
    log_parameters(
        feature,
        get_root_mean_square_error(predicted_values, y_test),
        get_r2_score(predicted_values, y_test),
        intercept,
        slope,
    )
Feature :MntCoffee RMSE :5.502 R2 Score :0.439 y = 14.864 * x + 5.264 --------------------------------------------
Feature :MntMeatProducts RMSE :6.097 R2 Score :0.311 y = 14.889 * x + 4.238 --------------------------------------------
Feature :Income RMSE :5.720 R2 Score :0.393 y = 14.864 * x + 3.812 --------------------------------------------
Feature :MntGoldProds RMSE :6.257 R2 Score :0.274 y = 14.943 * x + 3.759 --------------------------------------------
Multiple regression is a statistical technique that aims to model the relationship between a dependent variable and two or more independent variables.
Multiple regression with n independent variables is expressed as follows:
$$f(x) = \beta _{0} + \beta_{1} x_{1} + \beta_{2} x_{2} + \beta_{3} x_{3} + \beta_{4} x_{4} + ... + \beta_{n} x_{n} + c $$
To optimize the model for accurate predictions, multiple regression commonly employs iterative algorithms such as gradient descent.
The main goal of the optimization process is to make our predictions as close as possible to the actual values. We measure the prediction error using a cost function, usually denoted as $J(\beta)$.
$$ J(\beta)= \frac {1}{2m} Σ_{i=0}^{m-1}(y_i - (\hat \beta _{0} + \hat \beta_{1} x_{1} + \hat \beta_{2} x_{2} + \hat \beta_{3} x_{3} + \hat \beta_{4} x_{4} + ... + \hat \beta_{n} x_{n}) )^2 $$
Gradient descent iteratively adjusts the coefficients $(\beta_i)$ to minimize the cost function. The update rule for each coefficient is:
$$\beta_{i} = \beta _ {i} - \alpha \frac {∂J(\beta)}{∂\beta_{i}}$$
$$ \frac {∂J(\beta)}{∂\beta_{i}} = \frac {1}{m}Σ_{j=0}^{m-1}(y_j - (\hat \beta _{0} + \hat \beta_{1} x_{j1} + \hat \beta_{2} x_{j2} + \hat \beta_{3} x_{j3} + \hat \beta_{4} x_{j4} + ... + \hat \beta_{n} x_{jn})) x_{ji} $$
Based on the formula above and np.dot() method, complete this function to compute the predictions for an entire matrix of features given the matrix, bias, and the weights. Provide an explanation of np.dot method and the reasoning behind using this method in your code:
def predict_output(feature_matrix, weights, bias):
    """Return feature_matrix . weights + bias for every observation.

    np.dot performs the matrix-vector product, i.e. the weighted sum of the
    features of each row, in a single vectorized call.
    """
    return np.dot(feature_matrix, weights) + bias
As we saw, the cost function is the sum over the data points of the squared difference between an observed output and a predicted output.
Since the derivative of a sum is the sum of the derivatives, we can compute the derivative for a single data point and then sum over data points. We can write the squared difference between the observed output and predicted output for a single point as follows:
$$ (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} ))^2 $$
With n features and a constant term, the derivative will be:
$$ 2 * (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} )) * [feature_i] $$
The term inside the parentheses is just the error (difference between prediction and output). So we can re-write this as:
$$2 * error*[feature_i] $$
That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!
Recall that twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors.
With this in mind, complete the following derivative function which computes the derivative of the weight given the value of the feature (over all data points) and the errors (over all data points).
def feature_derivative(errors, feature):
    """Derivative of the squared-error cost w.r.t. one weight: 2 * (errors . feature)."""
    return 2 * np.dot(errors, feature)
Now we will write a function that performs a gradient descent. The basic premise is simple. Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of increase and therefore the negative gradient is the direction of decrease and we're trying to minimize a cost function.
The amount by which we move in the negative gradient direction is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed 'tolerance'.
With this in mind, complete the following gradient descent function below using your derivative function above. For each step in the gradient descent we update the weight for each feature before computing our stopping criteria.
def regression_gradient_descent(feature_matrix, outputs, initial_weights, bias, step_size, tolerance):
    """Minimize the squared-error cost by batch gradient descent.

    :param feature_matrix: (m, n) array of observations
    :param outputs: (m,) array of true targets
    :param initial_weights: (n,) starting weight vector
    :param bias: starting intercept term
    :param step_size: learning rate
    :param tolerance: stop once the weight-gradient norm falls below this
    :return: (weights, bias) after convergence
    """
    weights = np.array(initial_weights)
    converged = False
    while not converged:
        predictions = predict_output(feature_matrix, weights, bias)
        errors = outputs - predictions
        # d(cost)/d(w_i) = -2 * (errors . feature_i). Fix: call
        # feature_derivative with its declared (errors, feature) argument
        # order; np.dot(errors, X) equals the original np.dot(X.T, errors).
        gradient = -feature_derivative(errors, feature_matrix)
        weights -= step_size * gradient
        # Bias gradient is just -2 * sum of errors (feature value is 1).
        bias_gradient = -2 * np.sum(errors)
        bias -= step_size * bias_gradient
        # Convergence test on the weight gradient only (bias excluded,
        # matching the original behavior).
        if np.linalg.norm(gradient) < tolerance:
            converged = True
    return weights, bias
def normalize_features(chosen_features, data_frame):
    """Standardize each chosen column in place (zero mean, unit sample std) and return the frame."""
    for col in chosen_features:
        column = data_frame[col]
        data_frame.loc[:, col] = (column - column.mean()) / column.std()
    return data_frame
def n_feature_regression(chosen_feature_matrix, target_matrix, keywords):
    """Run gradient descent using the hyper-parameters stored in *keywords*.

    Expects the keys 'initial_weights', 'bias', 'step_size' and 'tolerance'.
    """
    return regression_gradient_descent(
        chosen_feature_matrix,
        target_matrix,
        keywords['initial_weights'],
        keywords['bias'],
        keywords['step_size'],
        keywords['tolerance'],
    )
def get_weights_and_bias(chosen_features):
    """
    Computes the weights and bias for a general n feature model.
    :param chosen_features: list of features to perform multiple regression on
    :return: chosen_feature_matrix, computed weights and bias via regression
    """
    # ToDo
    # Would selecting different initial weights make any difference?
    # Explain your answer.
    # NOTE(review): the squared-error cost is convex, so different starting
    # weights should reach (numerically) the same solution, changing only
    # the number of iterations — confirm empirically.
    keywords = {
        'initial_weights': np.array([.5]*len(chosen_features)),
        'step_size': 1.e-4,
        'tolerance': 1.e-10,
        'bias': 0
    }
    # Uses the module-level training split (x_train / y_train).
    chosen_feature_dataframe = x_train[chosen_features]
    # ToDo
    # Why are the features normalized?
    # NOTE(review): normalization puts features on a common scale so one
    # step_size suits every weight; the call below is currently disabled,
    # presumably because the frame was already standardized upstream.
    # chosen_feature_dataframe = normalize_features(chosen_features, chosen_feature_dataframe)
    chosen_feature_matrix = chosen_feature_dataframe.to_numpy()
    target_column = y_train
    target_matrix = target_column.to_numpy()
    train_weights, bias = n_feature_regression(chosen_feature_matrix, target_matrix, keywords)
    return chosen_feature_matrix, train_weights, bias
In this part, you should choose 2 features and implement multiple regression on them :
# Multiple regression on the two most correlated features.
chosen_features = selected_features.index[:2]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
# ToDo
# compute the predictions
predictions = predict_output(x_test[chosen_features], train_weights, bias)
#ToDo
# Calculate the R2 score and mean square error
# Explain the results
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.374576615233149 R2 Score : 0.4643213845425165 --------------------------------------------
Now repeat the steps for 3 features
# ToDo
# Same pipeline repeated with the top three correlated features.
chosen_features = selected_features.index[:3]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
predictions = predict_output(x_test[chosen_features], train_weights, bias)
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.199870341008263 R2 Score : 0.49858095852511164 --------------------------------------------
Finally, repeat the steps for 5 features
Explain the differences in the results and the reasoning behind these variations.
# ToDo
# ToDo
# NOTE(review): selected_features holds only 4 entries, so [:5] actually
# selects 4 features here — confirm the intended feature count.
chosen_features = selected_features.index[:5]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
predictions = predict_output(x_test[chosen_features], train_weights, bias)
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.0354824739760895 R2 Score : 0.529783385182038 --------------------------------------------
### PurchaseRate based on NumPurchases
df['PurchaseRate'] = np.where(df['NumPurchases'] > df['NumPurchases'].median(), 'HIGH', 'LOW')
median_num_purchases = df['NumPurchases'].median()
df.drop(columns=['NumPurchases'], inplace=True)
TARGET_COLUMN = 'PurchaseRate'
df
| Income | Kidhome | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | UsedCampaignOffer | PurchaseRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.244835 | -0.823702 | 1.057762 | 1.551577 | 1.679702 | 2.462147 | 1.476500 | 0.855299 | 0.262478 | 1.636509 | HIGH |
| 1 | -0.241838 | 1.034397 | -0.357096 | -0.636301 | -0.713225 | -0.650449 | -0.631503 | -0.729039 | -0.165745 | -0.611057 | LOW |
| 2 | 0.800874 | -0.823702 | -0.357096 | 0.570804 | -0.177032 | 1.345274 | -0.146905 | -0.033476 | 0.262478 | -0.611057 | HIGH |
| 3 | -1.054666 | 1.034397 | -0.869905 | -0.560857 | -0.651187 | -0.503974 | -0.583043 | -0.748360 | 0.262478 | -0.611057 | LOW |
| 4 | 0.251231 | 1.034397 | -0.369453 | 0.419916 | -0.216914 | 0.155164 | -0.001525 | -0.555148 | -0.165745 | -0.611057 | HIGH |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 0.372136 | -0.823702 | 1.286363 | 0.419916 | 0.066692 | 0.081926 | 2.203398 | 3.927370 | -0.165745 | -0.611057 | HIGH |
| 2236 | 0.487305 | 2.892495 | 0.350333 | -0.661449 | -0.606873 | -0.687068 | -0.655733 | -0.690396 | 0.690700 | 1.636509 | HIGH |
| 2237 | 0.197092 | -0.823702 | 1.901116 | 0.545656 | 0.221789 | -0.101168 | -0.364974 | -0.381257 | 0.262478 | 1.636509 | HIGH |
| 2238 | 0.703160 | -0.823702 | 0.418295 | 0.092992 | 0.208495 | 0.777683 | 0.071165 | 0.333627 | 0.262478 | -0.611057 | HIGH |
| 2239 | 0.027413 | 1.034397 | -0.644392 | -0.586005 | -0.469501 | -0.650449 | -0.631503 | -0.439221 | 0.690700 | 1.636509 | LOW |
2240 rows × 11 columns
# Encode the binary target as integers (LOW -> 0, HIGH -> 1).
# Bug fix: column-level `.replace(..., inplace=True)` is a chained operation
# that newer pandas deprecates (it may act on a copy and leave df unchanged);
# assign the result back instead.
df['PurchaseRate'] = df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1})
df
| Income | Kidhome | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | UsedCampaignOffer | PurchaseRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.244835 | -0.823702 | 1.057762 | 1.551577 | 1.679702 | 2.462147 | 1.476500 | 0.855299 | 0.262478 | 1.636509 | 1 |
| 1 | -0.241838 | 1.034397 | -0.357096 | -0.636301 | -0.713225 | -0.650449 | -0.631503 | -0.729039 | -0.165745 | -0.611057 | 0 |
| 2 | 0.800874 | -0.823702 | -0.357096 | 0.570804 | -0.177032 | 1.345274 | -0.146905 | -0.033476 | 0.262478 | -0.611057 | 1 |
| 3 | -1.054666 | 1.034397 | -0.869905 | -0.560857 | -0.651187 | -0.503974 | -0.583043 | -0.748360 | 0.262478 | -0.611057 | 0 |
| 4 | 0.251231 | 1.034397 | -0.369453 | 0.419916 | -0.216914 | 0.155164 | -0.001525 | -0.555148 | -0.165745 | -0.611057 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 0.372136 | -0.823702 | 1.286363 | 0.419916 | 0.066692 | 0.081926 | 2.203398 | 3.927370 | -0.165745 | -0.611057 | 1 |
| 2236 | 0.487305 | 2.892495 | 0.350333 | -0.661449 | -0.606873 | -0.687068 | -0.655733 | -0.690396 | 0.690700 | 1.636509 | 1 |
| 2237 | 0.197092 | -0.823702 | 1.901116 | 0.545656 | 0.221789 | -0.101168 | -0.364974 | -0.381257 | 0.262478 | 1.636509 | 1 |
| 2238 | 0.703160 | -0.823702 | 0.418295 | 0.092992 | 0.208495 | 0.777683 | 0.071165 | 0.333627 | 0.262478 | -0.611057 | 1 |
| 2239 | 0.027413 | 1.034397 | -0.644392 | -0.586005 | -0.469501 | -0.650449 | -0.631503 | -0.439221 | 0.690700 | 1.636509 | 0 |
2240 rows × 11 columns
def split_data(dataframe: pd.DataFrame, outcome: str, train_percent: float = 0.7):
    """Split *dataframe* into train/test features and the *outcome* target.

    :param dataframe: full dataset including the outcome column
    :param outcome: name of the target column
    :param train_percent: fraction of rows used for training
    :return: (x_train, x_test, y_train, y_test)
    """
    ddata = dataframe.drop(columns=[outcome])
    odata = dataframe[outcome]
    # Fix: the original bound the split result to a local named `split_data`,
    # shadowing this very function inside its own body.
    dtrain, dtest, otrain, otest = train_test_split(
        ddata, odata, train_size=train_percent, random_state=1)
    return dtrain, dtest, otrain, otest
x_train, x_test, y_train, y_test = split_data(df, 'PurchaseRate')
class Classifier:
    """Thin wrapper around an sklearn-style estimator class.

    The constructor instantiates ``model`` with ``params``, fits it on the
    training split, and caches the test-set predictions for later scoring.
    """
    def __init__(self, model, x_train, y_train, x_test, y_test, params=None):
        self.model = model  # the estimator *class*, not an instance
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.params = params if params else {}
        self.set_params()
    def set_params(self):
        """Instantiate the model with the stored params, fit it, and predict on the test set."""
        self.model_instance = self.model(**self.params)
        self.model_instance.fit(self.x_train, self.y_train)
        self.predictions = self.model_instance.predict(self.x_test)
    def accuracy_test(self) -> float:
        """Accuracy on the held-out test split (cached predictions)."""
        return metrics.accuracy_score(self.y_test, self.predictions)
    def accuracy_train(self) -> float:
        """Accuracy on the training split — useful to spot overfitting."""
        train_predict = self.model_instance.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, train_predict)
    def confusion_matrix(self):
        """Plot the confusion matrix of the cached test predictions."""
        matrix = metrics.confusion_matrix(self.y_test, self.predictions)
        matrix_disp = metrics.ConfusionMatrixDisplay(matrix)
        matrix_disp.plot(cmap='Blues')
        plt.grid(False)
        plt.title(f'{self.model.__name__} Confusion Matrix')
        plt.show()
    def log_grid_result(self, grid):
        """Print the best hyper-parameters and cross-validated train score."""
        print(f"- Best hyperparameters : {grid.best_params_}")
        print(f"- Best model's train score (accuracy): {grid.best_score_:0.3f}")
    def grid_search(self, search_params, scoring='accuracy') -> tuple[float, GridSearchCV]:
        """Exhaustive hyper-parameter search; prints a summary and returns (test_score, grid)."""
        grid = GridSearchCV(self.model_instance, search_params, scoring=scoring)
        grid.fit(self.x_train, self.y_train)
        test_score = grid.score(self.x_test, self.y_test)
        print(f"- model's test accuracy : {self.accuracy_test():0.3f}")
        print(f"- Test Score(accuracy) : {test_score:0.3f}")
        self.log_grid_result(grid)
        print(f"- Best model's test score : {test_score:0.3f}")
        # Bug fix: the return was commented out, so callers such as
        # `grid_res = knear_model.grid_search(...)` received None despite
        # the tuple annotation above.
        return test_score, grid
    def predict(self, x):
        """Predict with the fitted instance.

        Bug fix: the original called ``self.model.predict`` — ``self.model``
        is the unfitted class, not the fitted ``self.model_instance``.
        """
        return self.model_instance.predict(x)
A decision tree is a versatile tool in machine learning, working well for sorting things into groups or guessing values. It chops data into smaller chunks based on different traits, trying to keep things similar within each chunk.
It starts by looking at all the data and picks a trait that splits it into two groups that are as much alike as possible for the thing we care about. It keeps doing this for each smaller group until it hits specific rules, like how detailed the tree can get or how many examples are in each group.
For sorting things, it tries to find traits that give the most useful info or reduce the messiness in the groups. For guessing values, it looks for traits that get our guesses as close as possible to the real answers.
Once the tree's made, it can guess things about new data by following its branches from start to finish. Where it ends tells us our prediction. For sorting things, it might go with the most common group, and for guessing numbers, it could be an average.
These trees are great because they're simple to understand and display visually. They handle different types of data, like categories or numbers. But sometimes, they can get too detailed or struggle if there's noisy or extra stuff in the data. People have ways to fix this, like simplifying the tree or using groups of trees together to make better decisions.
some hyper-parameters:
The maximum depth of the decision tree. A larger value of max_depth can capture more complex patterns in the data, but may also lead to overfitting. A smaller value of max_depth may lead to underfitting.
The minimum number of samples required to split an internal node.
The minimum number of samples required to be at a leaf node.
The function used to measure the quality of a split. The two options available are gini and entropy.
The strategy used to choose the split at each node. The two options here are random and best.
# Decision-tree hyper-parameter grid explored by GridSearchCV.
grid_s_params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 9),
    "min_samples_split": range(2, 9),
    "min_samples_leaf": range(2, 9),
    "random_state": [54],
}
# Fit a default tree, then tune it over the grid above.
dtree_model = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test)
dtree_model.grid_search(grid_s_params)
- model's test accuracy : 0.882
- Test Score(accuracy) : 0.914
- Best hyperparameters : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score : 0.914
KNN, short for K-Nearest Neighbors, is a straightforward and efficient method used for sorting things into categories or guessing values. It finds the closest K neighbors from the training data to a new item and decides its category or value based on either voting or averaging those neighbors' characteristics. KNN doesn't make any strict assumptions about how the data is spread out.
Choosing the right K value is crucial. If K is large, it can help smooth out any irregularities in the data but might make the model too simple. If K is small, it can capture intricate details but might overcomplicate the model.
some settings include:
This specifies how to find those nearest neighbors. The options include auto, ball_tree, kd_tree, and brute.
It's the number of neighbors considered when making a decision. Higher values reduce noise but might oversimplify, while lower values can catch complex details but risk overfitting.
It's the measure of distance used to find these neighbors in the data.
# KNN search space: neighbor count and distance metric.
grid_s_params = {
    "n_neighbors": range(2,20),
    "metric": ["euclidean", "manhattan", "minkowski"]
}
knear_model = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test)
knear_model.grid_search(grid_s_params)
- model's test accuracy : 0.887
- Test Score(accuracy) : 0.890
- Best hyperparameters : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score : 0.890
Logistic Regression serves as a supervised learning technique primarily employed for classification tasks. It operates by modeling the likelihood of a binary outcome (like 0 or 1) based on the input features. This model produces a probability score between 0 and 1, representing the chance of the binary outcome occurring. Logistic regression, being parametric, makes certain assumptions about the data's distribution. It's versatile, handling both categorical and numerical data.
The logistic regression equation is:
$$(P(y=1|X)) = \sigma(z) = \frac{1}{1 + e^{-z}}$$
where the z is: $$z = w_1x_1 + w_2x_2 + ... + w_nx_n + b$$ where each $w_i$ is the weight associated with the $i^{th}$ feature.
The model computes optimal coefficients that minimize the difference between predicted probabilities and actual labels in the training data. Predictions involve calculating the probability and applying a decision threshold.
some hyperparameters:
This parameter chooses the optimization algorithm for coefficient optimization. Common solvers include lbfgs, liblinear, newton-cg, sag, and saga.
Dictates L1 or L2 regularization to curb overfitting. Penalty picks between the two. L1 can aid feature selection by zeroing some coefficients, while L2 shrinks coefficients towards zero.
Governs the regularization strength. A lower C means stronger regularization, preventing overfitting but potentially causing underfitting.
# Logistic-regression search space: regularization strength C, penalty, solver.
grid_s_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l2"],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
}
logreg_model = Classifier(LogisticRegression, x_train, y_train, x_test, y_test)
logreg_model.grid_search(grid_s_params)
- model's test accuracy : 0.881
- Test Score(accuracy) : 0.879
- Best hyperparameters : {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
- Best model's train score (accuracy): 0.897
- Best model's test score : 0.879
# Plot the confusion matrix of each tuned classifier.
dtree_model.confusion_matrix()
knear_model.confusion_matrix()
logreg_model.confusion_matrix()
# Final models with hand-picked hyper-parameters.
# NOTE(review): these values differ from the grid-search optima printed
# earlier (e.g. max_depth=4 vs 2, n_neighbors=9 vs 3, C=1 vs 0.1) — confirm
# the choice is intentional.
dt_model = DecisionTreeClassifier(
    max_depth=4, min_samples_split=8,
    min_samples_leaf=2, random_state=54,
    splitter='best', criterion='gini'
)
dt_model.fit(x_train, y_train)
y_pred_dt = dt_model.predict(x_test)
# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
# weights='uniform', algorithm='kd_tree')
knn_model.fit(x_train, y_train)
y_pred_knn = knn_model.predict(x_test)
# Logistic Regression Classifier
logreg_model = LogisticRegression(C=1, penalty='l2', solver='newton-cg')
# max_iter=2000, random_state=42)
logreg_model.fit(x_train, y_train)
y_pred_logreg = logreg_model.predict(x_test)
# Collect fitted models and their test predictions for joint evaluation.
models = {'Decision Tree': (dt_model, y_pred_dt),
          'KNN': (knn_model, y_pred_knn),
          'Logistic Regression': (logreg_model, y_pred_logreg)}
def plot_model_evaluation(models, X_test, y_test):
    """For every fitted model, draw its confusion matrix (top row) and a
    heatmap of its classification report with the accuracy annotated
    underneath (bottom row)."""
    fig, axes = plt.subplots(nrows=2, ncols=len(models), figsize=(20, 10))
    plt.subplots_adjust(hspace=0.5)
    tick_labels = ['Low', 'High']
    for col, (model_name, (model, y_pred)) in enumerate(models.items()):
        cm_axis, report_axis = axes[0, col], axes[1, col]
        # Confusion matrix heatmap with integer cell counts.
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d',
                    cmap='Blues', ax=cm_axis,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        cm_axis.set_title(f'Confusion Matrix - {model_name}')
        # Per-class precision/recall/f1 as a heatmap (support row dropped).
        report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
        sns.heatmap(report.iloc[:-1, :].T, annot=True, cmap='Blues', ax=report_axis)
        report_axis.set_title(f'Classification Report - {model_name}')
        # Overall accuracy annotated below the report heatmap.
        report_axis.text(0.5, -0.2, f'Accuracy: {accuracy_score(y_test, y_pred):.2%}',
                         horizontalalignment='center', verticalalignment='center',
                         transform=report_axis.transAxes)
    plt.show()
plot_model_evaluation(models, x_test, y_test)
def hyper_param_comp(param_range, init_params, param):
    """Sweep one Random-Forest hyperparameter and plot train vs. test accuracy.

    NOTE(review): this function depends on `Classifier`, `grid_search_params`,
    and `X_train`/`y_train`/`X_test`/`y_test` being defined in other notebook
    cells. Other cells use lowercase `x_train`/`x_test`, so the uppercase
    names here look like a casing inconsistency — verify before running.
    Also note `init_params` is mutated in place, which the caller observes.
    """
    test_accs = []
    train_accs = []
    for i in param_range:
        # Overwrite only the swept parameter; the other entries stay fixed.
        init_params[param] = i
        classifier = RandomForestClassifier(criterion=init_params['criterion'], max_depth=init_params['max_depth'], n_estimators=init_params['n_estimators'])
        RF = Classifier(classifier, grid_search_params, X_train, y_train)
        test_accs.append(RF.calc_accuracy(X_test, y_test))
        train_accs.append(RF.train_accuracy)
    # Overlay both curves to visualize over-/under-fitting as the parameter grows.
    plt.plot(param_range, test_accs, color='blue', label='test')
    plt.plot(param_range, train_accs, color='red', label='train')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
GridSearchCV is a technique used in machine learning to fine-tune hyperparameters effectively, aiming to find the best combination for a model. It involves specifying various hyperparameters and testing different combinations to determine the optimal set.
Key parameters include:
- `estimator`: This represents the model under consideration for parameter tuning.
- `param_grid`: A dictionary or list of dictionaries that outlines the hyperparameters and their potential values to explore.
- `cv`: Denotes the cross-validation strategy, determining how the dataset is split into training and validation sets.
- `scoring`: This metric evaluates and scores the model's performance against the validation set.
- `n_jobs`: Specifies the number of CPU cores used for parallel processing. Using n_jobs=-1 utilizes all available CPU cores for faster computation.
- `verbose`: Controls the level of detail in the output during the search process. Setting verbose=1 displays progress messages during the hyperparameter search.
# Hyperparameter grid for tuning the Decision Tree wrapper.
dtree_grid_s_params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 9),
    "min_samples_split": range(2, 9),
    "min_samples_leaf": range(2, 9),
    "random_state": [54],
}
print('Decision Tree')
dtree_model.grid_search(dtree_grid_s_params)
# Hyperparameter grid for tuning the KNN wrapper: neighbor count and distance metric.
knear_grid_s_params = {
    "n_neighbors": range(2, 20),
    "metric": ["euclidean", "manhattan", "minkowski"],
}
print('K Nearest Neighbors')
grid_res = knear_model.grid_search(knear_grid_s_params)
# print(f'Logistic reg model')
# grid_s_params = {
# 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# "penalty": ["l2"],
# "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }
# grid_res = logreg_model.grid_search()
Decision Tree
- model's test accuracy : 0.882
- Test Score(accuracy) : 0.914
- Best hyperparameters : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score : 0.914
K Nearest Neighbors
- model's test accuracy : 0.887
- Test Score(accuracy) : 0.890
- Best hyperparameters : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score : 0.890
# Retrain the Decision Tree with the grid-search winning hyperparameters
# and report accuracy on both splits.
dtree_params = {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 8, 'min_samples_split': 2, 'random_state': 1}
new_dtree_classifier = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test, dtree_params)
dtree_test_pct = new_dtree_classifier.accuracy_test() * 100
dtree_train_pct = new_dtree_classifier.accuracy_train() * 100
print(f'Decision Tree test: {dtree_test_pct:.3f}%')
print(f'Decision Tree train: {dtree_train_pct:.3f}%')
Decision Tree test: 91.220% Decision Tree train: 92.921%
# Retrain KNN with the tuned neighbor count and report accuracy on both splits.
knear_params = {'n_neighbors': 9}
new_knear_classifier = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test, knear_params)
knn_test_pct = new_knear_classifier.accuracy_test() * 100
knn_train_pct = new_knear_classifier.accuracy_train() * 100
print(f'knn test: {knn_test_pct:.3f}%')
print(f'knn train: {knn_train_pct:.3f}%')
knn test: 88.542% knn train: 92.283%
There are two important concepts:
Bias: Bias refers to our accuracy in capturing the intricacies of the training data. Higher precision leads to lower bias. It implies considering every feature, even noisy data. Lower bias results in reduced error detection in the training data. However, it's not necessarily positive since it might lead to training on incorrect, noisy data, ultimately reducing the classifier's ability to generalize.
Variance: Variance gauges the classifier's performance on new, unseen data. Lower variance implies better performance on testing data. It signifies the classifier's ability to generalize and accurately identify patterns in the testing dataset.
As much as we want low bias and variance at once, that is not usually the case because once we lower one, the other one increases.
Overfitting happens when a model becomes excessively intricate, fitting the training data too precisely. It starts capturing noise instead of the actual patterns in the data, leading to reduced performance when applied to new, unseen data. This problem arises due to several factors: an abundance of features, excessive complexity, or prolonged training. To spot overfitting, comparing the model's performance on training versus validation data is crucial. If the model excels on the training data but falters on the validation data, it's likely overfitting.
Underfitting occurs when a model is too basic to capture the underlying data patterns adequately. It results in poor performance not only on the training data but also on new, unseen data. Underfitting arises from having too few features, excessive simplicity, or insufficient training duration. Detecting underfitting involves assessing the model's performance on both the training and validation data. If the model struggles on both sets of data, it's likely underfitting.
To combat overfitting, strategies like regularization, early stopping, or simplifying the model can be employed. Regularization involves adding a penalty to the loss function to discourage overly large weights, while early stopping halts training when the model's performance on validation data plateaus.
To tackle underfitting, approaches such as adding more features, increasing model complexity, or extending training duration can be helpful. Yet, it's crucial to strike a balance between model complexity and available data. A more intricate model might demand more data to prevent overfitting from occurring.
# Training accuracy of both wrapper models, printed in the same order/format.
for label, wrapped in (('Decision Tree', dtree_model), ('K Nearest Neighbors', knear_model)):
    print(f'{label}: {wrapped.accuracy_train() * 100:.3f}%')
Decision Tree: 100.000% K Nearest Neighbors: 93.495%
Test previous Results:
Decision Tree: $90.923%$
K Nearest Neighbors: $88.095%$
Logistic Regression: $87.649%$
As we can see, the result on the train data is a little better than the result on the test data, but overall they are in the same range.
After changing the threshold for removing columns with low correlations, the result changes, but the total change isn't very significant.
# Visualize the fitted decision tree; class names match the binary target labels.
plot_tree(dtree_model.model_instance, filled=True, feature_names=x_train.columns, class_names=['Low', 'High'])
[Text(0.26328633288344405, 0.9772727272727273, 'MntMeatProducts <= -0.458\ngini = 0.5\nsamples = 1568\nvalue = [800, 768]\nclass = Low'), Text(0.12624584717607973, 0.9318181818181818, 'MntCoffee <= 0.057\ngini = 0.176\nsamples = 769\nvalue = [694, 75]\nclass = Low'), Text(0.11738648947951273, 0.8863636363636364, 'MntGoldProds <= 0.111\ngini = 0.079\nsamples = 724\nvalue = [694, 30]\nclass = Low'), Text(0.0664451827242525, 0.8409090909090909, 'MntSweetProducts <= 0.943\ngini = 0.04\nsamples = 685\nvalue = [671, 14]\nclass = Low'), Text(0.05758582502768549, 0.7954545454545454, 'MntCoffee <= -0.334\ngini = 0.035\nsamples = 683\nvalue = [671, 12]\nclass = Low'), Text(0.026578073089700997, 0.75, 'MntCoffee <= -0.414\ngini = 0.015\nsamples = 654\nvalue = [649, 5]\nclass = Low'), Text(0.017718715393133997, 0.7045454545454546, 'gini = 0.0\nsamples = 580\nvalue = [580, 0]\nclass = Low'), Text(0.035437430786267994, 0.7045454545454546, 'Income <= 0.444\ngini = 0.126\nsamples = 74\nvalue = [69, 5]\nclass = Low'), Text(0.026578073089700997, 0.6590909090909091, 'MntMeatProducts <= -0.569\ngini = 0.104\nsamples = 73\nvalue = [69, 4]\nclass = Low'), Text(0.017718715393133997, 0.6136363636363636, 'gini = 0.0\nsamples = 58\nvalue = [58, 0]\nclass = Low'), Text(0.035437430786267994, 0.6136363636363636, 'MntMeatProducts <= -0.529\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'), Text(0.026578073089700997, 0.5681818181818182, 'MntGoldProds <= -0.062\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = Low'), Text(0.017718715393133997, 0.5227272727272727, 'NumWebVisitsMonth <= 0.048\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.008859357696566999, 0.4772727272727273, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.026578073089700997, 0.4772727272727273, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.035437430786267994, 0.5227272727272727, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.044296788482835, 
0.5681818181818182, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = Low'), Text(0.044296788482835, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.08859357696567, 0.75, 'Kidhome <= 0.105\ngini = 0.366\nsamples = 29\nvalue = [22, 7]\nclass = Low'), Text(0.07087486157253599, 0.7045454545454546, 'MntSweetProducts <= -0.353\ngini = 0.111\nsamples = 17\nvalue = [16, 1]\nclass = Low'), Text(0.06201550387596899, 0.6590909090909091, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'), Text(0.07973421926910298, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.10631229235880399, 0.7045454545454546, 'MntSweetProducts <= -0.559\ngini = 0.5\nsamples = 12\nvalue = [6, 6]\nclass = Low'), Text(0.09745293466223699, 0.6590909090909091, 'MntMeatProducts <= -0.656\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = High'), Text(0.08859357696567, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.10631229235880399, 0.6136363636363636, 'MntGoldProds <= -0.574\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = High'), Text(0.09745293466223699, 0.5681818181818182, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.11517165005537099, 0.5681818181818182, 'MntCoffee <= -0.274\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'), Text(0.10631229235880399, 0.5227272727272727, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.12403100775193798, 0.5227272727272727, 'MntCoffee <= -0.142\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.11517165005537099, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.132890365448505, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.11517165005537099, 0.6590909090909091, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.0753045404208195, 0.7954545454545454, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), 
Text(0.16832779623477298, 0.8409090909090909, 'MntMeatProducts <= -0.498\ngini = 0.484\nsamples = 39\nvalue = [23, 16]\nclass = Low'), Text(0.15946843853820597, 0.7954545454545454, 'Income <= -0.104\ngini = 0.451\nsamples = 35\nvalue = [23, 12]\nclass = Low'), Text(0.132890365448505, 0.75, 'NumWebVisitsMonth <= -1.45\ngini = 0.255\nsamples = 20\nvalue = [17, 3]\nclass = Low'), Text(0.12403100775193798, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.14174972314507198, 0.7045454545454546, 'MntGoldProds <= 0.14\ngini = 0.188\nsamples = 19\nvalue = [17, 2]\nclass = Low'), Text(0.132890365448505, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.150609080841639, 0.6590909090909091, 'MntFishProducts <= -0.019\ngini = 0.105\nsamples = 18\nvalue = [17, 1]\nclass = Low'), Text(0.14174972314507198, 0.6136363636363636, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'), Text(0.15946843853820597, 0.6136363636363636, 'Income <= -0.451\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.150609080841639, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.16832779623477298, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.18604651162790697, 0.75, 'MntFruits <= -0.184\ngini = 0.48\nsamples = 15\nvalue = [6, 9]\nclass = High'), Text(0.17718715393134, 0.7045454545454546, 'MntFishProducts <= -0.586\ngini = 0.496\nsamples = 11\nvalue = [6, 5]\nclass = Low'), Text(0.16832779623477298, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.18604651162790697, 0.6590909090909091, 'Income <= -0.052\ngini = 0.375\nsamples = 8\nvalue = [6, 2]\nclass = Low'), Text(0.17718715393134, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.19490586932447398, 0.6136363636363636, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = Low'), Text(0.19490586932447398, 0.7045454545454546, 
'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.17718715393134, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.13510520487264674, 0.8863636363636364, 'gini = 0.0\nsamples = 45\nvalue = [0, 45]\nclass = High'), Text(0.40032681859080843, 0.9318181818181818, 'MntCoffee <= -0.539\ngini = 0.23\nsamples = 799\nvalue = [106, 693]\nclass = High'), Text(0.2702104097452935, 0.8863636363636364, 'MntGoldProds <= -0.797\ngini = 0.452\nsamples = 55\nvalue = [36, 19]\nclass = Low'), Text(0.26135105204872644, 0.8409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.27906976744186046, 0.8409090909090909, 'MntSweetProducts <= 1.222\ngini = 0.403\nsamples = 50\nvalue = [36, 14]\nclass = Low'), Text(0.2702104097452935, 0.7954545454545454, 'Kidhome <= 1.963\ngini = 0.34\nsamples = 46\nvalue = [36, 10]\nclass = Low'), Text(0.26135105204872644, 0.75, 'Income <= 2.77\ngini = 0.298\nsamples = 44\nvalue = [36, 8]\nclass = Low'), Text(0.25249169435215946, 0.7045454545454546, 'MntFishProducts <= 0.082\ngini = 0.273\nsamples = 43\nvalue = [36, 7]\nclass = Low'), Text(0.22148394241417496, 0.6590909090909091, 'MntCoffee <= -0.609\ngini = 0.128\nsamples = 29\nvalue = [27, 2]\nclass = Low'), Text(0.21262458471760798, 0.6136363636363636, 'gini = 0.0\nsamples = 20\nvalue = [20, 0]\nclass = Low'), Text(0.23034330011074197, 0.6136363636363636, 'MntCoffee <= -0.601\ngini = 0.346\nsamples = 9\nvalue = [7, 2]\nclass = Low'), Text(0.22148394241417496, 0.5681818181818182, 'MntSweetProducts <= -0.522\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = High'), Text(0.21262458471760798, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.23034330011074197, 0.5227272727272727, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.23920265780730898, 0.5681818181818182, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = Low'), Text(0.28349944629014395, 0.6590909090909091, 'MntFishProducts 
<= 0.32\ngini = 0.459\nsamples = 14\nvalue = [9, 5]\nclass = Low'), Text(0.26578073089701, 0.6136363636363636, 'MntGoldProds <= -0.758\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'), Text(0.25692137320044295, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.27464008859357697, 0.5681818181818182, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.301218161683278, 0.6136363636363636, 'MntMeatProducts <= -0.425\ngini = 0.198\nsamples = 9\nvalue = [8, 1]\nclass = Low'), Text(0.292358803986711, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.31007751937984496, 0.5681818181818182, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = Low'), Text(0.2702104097452935, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.27906976744186046, 0.75, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.28792912513842744, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.5304432274363233, 0.8863636363636364, 'MntMeatProducts <= -0.379\ngini = 0.17\nsamples = 744\nvalue = [70, 674]\nclass = High'), Text(0.36766334440753046, 0.8409090909090909, 'MntCoffee <= -0.223\ngini = 0.398\nsamples = 62\nvalue = [17, 45]\nclass = High'), Text(0.327796234772979, 0.7954545454545454, 'Income <= 0.18\ngini = 0.49\nsamples = 21\nvalue = [12, 9]\nclass = Low'), Text(0.31007751937984496, 0.75, 'MntGoldProds <= 0.092\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'), Text(0.301218161683278, 0.7045454545454546, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'), Text(0.31893687707641194, 0.7045454545454546, 'MntFishProducts <= -0.065\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.31007751937984496, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.327796234772979, 0.6590909090909091, 'MntFruits <= 0.168\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), 
Text(0.31893687707641194, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.33665559246954596, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.34551495016611294, 0.75, 'MntMeatProducts <= -0.392\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.33665559246954596, 0.7045454545454546, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.35437430786268, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.40753045404208194, 0.7954545454545454, 'MntMeatProducts <= -0.401\ngini = 0.214\nsamples = 41\nvalue = [5, 36]\nclass = High'), Text(0.38095238095238093, 0.75, 'MntCoffee <= 1.135\ngini = 0.157\nsamples = 35\nvalue = [3, 32]\nclass = High'), Text(0.37209302325581395, 0.7045454545454546, 'gini = 0.0\nsamples = 24\nvalue = [0, 24]\nclass = High'), Text(0.38981173864894797, 0.7045454545454546, 'MntCoffee <= 1.495\ngini = 0.397\nsamples = 11\nvalue = [3, 8]\nclass = High'), Text(0.38095238095238093, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'), Text(0.39867109634551495, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.43410852713178294, 0.75, 'MntCoffee <= -0.028\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.42524916943521596, 0.7045454545454546, 'MntGoldProds <= 0.053\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.416389811738649, 0.6590909090909091, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.43410852713178294, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.4429678848283499, 0.7045454545454546, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.6932231104651163, 0.8409090909090909, 'NumWebVisitsMonth <= -1.665\ngini = 0.143\nsamples = 682\nvalue = [53, 629]\nclass = High'), Text(0.5204872646733112, 0.7954545454545454, 'Income <= 0.553\ngini = 0.329\nsamples = 
101\nvalue = [21, 80]\nclass = High'), Text(0.47840531561461797, 0.75, 'MntMeatProducts <= 0.902\ngini = 0.499\nsamples = 19\nvalue = [10, 9]\nclass = Low'), Text(0.46068660022148394, 0.7045454545454546, 'MntGoldProds <= 2.72\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = Low'), Text(0.45182724252491696, 0.6590909090909091, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'), Text(0.4695459579180509, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.49612403100775193, 0.7045454545454546, 'MntCoffee <= -0.328\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = High'), Text(0.48726467331118495, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.5049833887043189, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.5625692137320044, 0.75, 'MntCoffee <= 0.736\ngini = 0.232\nsamples = 82\nvalue = [11, 71]\nclass = High'), Text(0.53156146179402, 0.7045454545454546, 'MntMeatProducts <= 1.79\ngini = 0.081\nsamples = 47\nvalue = [2, 45]\nclass = High'), Text(0.5227021040974529, 0.6590909090909091, 'gini = 0.0\nsamples = 42\nvalue = [0, 42]\nclass = High'), Text(0.540420819490587, 0.6590909090909091, 'MntMeatProducts <= 2.054\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = High'), Text(0.53156146179402, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.5492801771871539, 0.6136363636363636, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.593576965669989, 0.7045454545454546, 'MntMeatProducts <= 0.601\ngini = 0.382\nsamples = 35\nvalue = [9, 26]\nclass = High'), Text(0.5758582502768549, 0.6590909090909091, 'MntMeatProducts <= -0.197\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = Low'), Text(0.5669988925802879, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.584717607973422, 0.6136363636363636, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.6112956810631229, 
0.6590909090909091, 'MntMeatProducts <= 1.241\ngini = 0.285\nsamples = 29\nvalue = [5, 24]\nclass = High'), Text(0.602436323366556, 0.6136363636363636, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]\nclass = High'), Text(0.6201550387596899, 0.6136363636363636, 'MntFishProducts <= 1.171\ngini = 0.444\nsamples = 15\nvalue = [5, 10]\nclass = High'), Text(0.602436323366556, 0.5681818181818182, 'MntCoffee <= 0.88\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'), Text(0.593576965669989, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.6112956810631229, 0.5227272727272727, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'), Text(0.6378737541528239, 0.5681818181818182, 'MntFruits <= -0.158\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'), Text(0.6290143964562569, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.646733111849391, 0.5227272727272727, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.8659589562569213, 0.7954545454545454, 'Income <= -0.887\ngini = 0.104\nsamples = 581\nvalue = [32, 549]\nclass = High'), Text(0.8194559800664452, 0.75, 'MntMeatProducts <= -0.186\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.8105966223698782, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.8283153377630121, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9124619324473976, 0.75, 'MntFruits <= 4.205\ngini = 0.101\nsamples = 579\nvalue = [31, 548]\nclass = High'), Text(0.8460340531561462, 0.7045454545454546, 'MntSweetProducts <= -0.232\ngini = 0.099\nsamples = 577\nvalue = [30, 547]\nclass = High'), Text(0.7397563676633444, 0.6590909090909091, 'MntFruits <= 3.413\ngini = 0.159\nsamples = 172\nvalue = [15, 157]\nclass = High'), Text(0.7308970099667774, 0.6136363636363636, 'MntCoffee <= 3.385\ngini = 0.15\nsamples = 171\nvalue = [14, 157]\nclass = High'), Text(0.7220376522702104, 
0.5681818181818182, 'MntCoffee <= -0.269\ngini = 0.141\nsamples = 170\nvalue = [13, 157]\nclass = High'), Text(0.6644518272425249, 0.5227272727272727, 'NumWebVisitsMonth <= -0.594\ngini = 0.358\nsamples = 30\nvalue = [7, 23]\nclass = High'), Text(0.655592469545958, 0.4772727272727273, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.6733111849390919, 0.4772727272727273, 'MntMeatProducts <= -0.248\ngini = 0.293\nsamples = 28\nvalue = [5, 23]\nclass = High'), Text(0.6644518272425249, 0.4318181818181818, 'MntCoffee <= -0.366\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = High'), Text(0.646733111849391, 0.38636363636363635, 'MntMeatProducts <= -0.275\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.6378737541528239, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.655592469545958, 0.3409090909090909, 'NumWebVisitsMonth <= 0.905\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.646733111849391, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.6644518272425249, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.6821705426356589, 0.38636363636363635, 'MntFruits <= -0.636\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'), Text(0.6733111849390919, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.6910299003322259, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.6821705426356589, 0.4318181818181818, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]\nclass = High'), Text(0.7796234772978959, 0.5227272727272727, 'MntFruits <= 1.149\ngini = 0.082\nsamples = 140\nvalue = [6, 134]\nclass = High'), Text(0.7441860465116279, 0.4772727272727273, 'Income <= 0.845\ngini = 0.046\nsamples = 127\nvalue = [3, 124]\nclass = High'), Text(0.7264673311184939, 0.4318181818181818, 'MntGoldProds <= -0.671\ngini = 0.018\nsamples = 112\nvalue = [1, 111]\nclass = High'), 
Text(0.717607973421927, 0.38636363636363635, 'NumWebVisitsMonth <= -1.236\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.70874861572536, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7264673311184939, 0.3409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.7353266888150609, 0.38636363636363635, 'gini = 0.0\nsamples = 106\nvalue = [0, 106]\nclass = High'), Text(0.7619047619047619, 0.4318181818181818, 'Income <= 0.884\ngini = 0.231\nsamples = 15\nvalue = [2, 13]\nclass = High'), Text(0.7530454042081949, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.770764119601329, 0.38636363636363635, 'MntMeatProducts <= -0.159\ngini = 0.133\nsamples = 14\nvalue = [1, 13]\nclass = High'), Text(0.7619047619047619, 0.3409090909090909, 'UsedCampaignOffer <= 0.513\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.7530454042081949, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.770764119601329, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7796234772978959, 0.3409090909090909, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = High'), Text(0.8150609080841639, 0.4772727272727273, 'MntMeatProducts <= 1.613\ngini = 0.355\nsamples = 13\nvalue = [3, 10]\nclass = High'), Text(0.7973421926910299, 0.4318181818181818, 'MntFruits <= 1.262\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'), Text(0.7884828349944629, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8062015503875969, 0.38636363636363635, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'), Text(0.832779623477298, 0.4318181818181818, 'UsedCampaignOffer <= 0.513\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.8239202657807309, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.8416389811738649, 0.38636363636363635, 'gini = 
0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.7397563676633444, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7486157253599114, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9523117386489479, 0.6590909090909091, 'MntSweetProducts <= 3.415\ngini = 0.071\nsamples = 405\nvalue = [15, 390]\nclass = High'), Text(0.9242801771871539, 0.6136363636363636, 'MntFruits <= 1.111\ngini = 0.06\nsamples = 388\nvalue = [12, 376]\nclass = High'), Text(0.915420819490587, 0.5681818181818182, 'MntFruits <= 1.086\ngini = 0.087\nsamples = 262\nvalue = [12, 250]\nclass = High'), Text(0.90656146179402, 0.5227272727272727, 'MntCoffee <= -0.388\ngini = 0.081\nsamples = 261\nvalue = [11, 250]\nclass = High'), Text(0.8593576965669989, 0.4772727272727273, 'Income <= 0.256\ngini = 0.32\nsamples = 10\nvalue = [2, 8]\nclass = High'), Text(0.8504983388704319, 0.4318181818181818, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = High'), Text(0.8682170542635659, 0.4318181818181818, 'MntCoffee <= -0.461\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.8593576965669989, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.8770764119601329, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.9537652270210409, 0.4772727272727273, 'MntGoldProds <= 3.0\ngini = 0.069\nsamples = 251\nvalue = [9, 242]\nclass = High'), Text(0.925249169435216, 0.4318181818181818, 'Income <= 1.507\ngini = 0.062\nsamples = 248\nvalue = [8, 240]\nclass = High'), Text(0.8947951273532669, 0.38636363636363635, 'MntFruits <= 0.86\ngini = 0.05\nsamples = 236\nvalue = [6, 230]\nclass = High'), Text(0.8604651162790697, 0.3409090909090909, 'MntCoffee <= 0.533\ngini = 0.035\nsamples = 222\nvalue = [4, 218]\nclass = High'), Text(0.8516057585825028, 0.29545454545454547, 'gini = 0.0\nsamples = 105\nvalue = [0, 105]\nclass = High'), Text(0.8693244739756367, 
0.29545454545454547, 'MntCoffee <= 0.546\ngini = 0.066\nsamples = 117\nvalue = [4, 113]\nclass = High'), Text(0.8449612403100775, 0.25, 'MntMeatProducts <= 0.714\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.8361018826135105, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8538205980066446, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.893687707641196, 0.25, 'NumWebVisitsMonth <= 0.905\ngini = 0.051\nsamples = 115\nvalue = [3, 112]\nclass = High'), Text(0.8715393133997785, 0.20454545454545456, 'NumWebVisitsMonth <= -0.808\ngini = 0.037\nsamples = 107\nvalue = [2, 105]\nclass = High'), Text(0.8626799557032115, 0.1590909090909091, 'MntCoffee <= 0.726\ngini = 0.087\nsamples = 44\nvalue = [2, 42]\nclass = High'), Text(0.8449612403100775, 0.11363636363636363, 'MntCoffee <= 0.709\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'), Text(0.8361018826135105, 0.06818181818181818, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.8538205980066446, 0.06818181818181818, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8803986710963455, 0.11363636363636363, 'MntSweetProducts <= 0.059\ngini = 0.049\nsamples = 40\nvalue = [1, 39]\nclass = High'), Text(0.8715393133997785, 0.06818181818181818, 'MntGoldProds <= -0.072\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'), Text(0.8626799557032115, 0.022727272727272728, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.8803986710963455, 0.022727272727272728, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8892580287929125, 0.06818181818181818, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = High'), Text(0.8803986710963455, 0.1590909090909091, 'gini = 0.0\nsamples = 63\nvalue = [0, 63]\nclass = High'), Text(0.9158361018826136, 0.20454545454545456, 'MntCoffee <= 1.174\ngini = 0.219\nsamples = 8\nvalue = [1, 7]\nclass = High'), Text(0.9069767441860465, 
0.1590909090909091, 'Income <= 0.11\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.8981173864894795, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.9158361018826136, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9246954595791805, 0.1590909090909091, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = High'), Text(0.929125138427464, 0.3409090909090909, 'MntCoffee <= 0.357\ngini = 0.245\nsamples = 14\nvalue = [2, 12]\nclass = High'), Text(0.920265780730897, 0.29545454545454547, 'MntFishProducts <= 1.949\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.9114064230343301, 0.25, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.929125138427464, 0.25, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.937984496124031, 0.29545454545454547, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.955703211517165, 0.38636363636363635, 'Income <= 1.551\ngini = 0.278\nsamples = 12\nvalue = [2, 10]\nclass = High'), Text(0.946843853820598, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.964562569213732, 0.3409090909090909, 'MntFishProducts <= -0.477\ngini = 0.165\nsamples = 11\nvalue = [1, 10]\nclass = High'), Text(0.955703211517165, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.973421926910299, 0.29545454545454547, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'), Text(0.982281284606866, 0.4318181818181818, 'MntCoffee <= 0.14\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = High'), Text(0.973421926910299, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.991140642303433, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.9242801771871539, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9331395348837209, 0.5681818181818182, 'gini = 0.0\nsamples 
= 126\nvalue = [0, 126]\nclass = High'), Text(0.980343300110742, 0.6136363636363636, 'Income <= 0.691\ngini = 0.291\nsamples = 17\nvalue = [3, 14]\nclass = High'), Text(0.971483942414175, 0.5681818181818182, 'MntFruits <= 0.244\ngini = 0.49\nsamples = 7\nvalue = [3, 4]\nclass = High'), Text(0.9626245847176079, 0.5227272727272727, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.980343300110742, 0.5227272727272727, 'MntSweetProducts <= 3.827\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = Low'), Text(0.971483942414175, 0.4772727272727273, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'), Text(0.989202657807309, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.989202657807309, 0.5681818181818182, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'), Text(0.978889811738649, 0.7045454545454546, 'MntGoldProds <= 1.213\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.970030454042082, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.987749169435216, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High')]
class RandForestClassifier:
    """Thin wrapper around sklearn's RandomForestClassifier.

    The forest is built and fitted at construction time; helper methods
    expose train/test accuracy, a confusion-matrix plot, and a grid search
    over the main hyperparameters.
    """

    def __init__(self, x_train, y_train, x_test, y_test, n_estimators: int = 100, max_depth: int = 8):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.set_params()

    def set_params(self):
        """Create the forest from the stored hyperparameters, fit it, and
        cache the test-set predictions in ``self.randf_predict``."""
        self.randf = RandomForestClassifier(criterion='entropy',
                                            max_depth=self.max_depth,
                                            n_estimators=self.n_estimators,
                                            random_state=1)
        self.randf.fit(self.x_train, self.y_train)
        self.randf_predict = self.randf.predict(self.x_test)

    def accuracy_test(self) -> float:
        """Accuracy on the held-out test split."""
        return metrics.accuracy_score(self.y_test, self.randf_predict)

    def accuracy_train(self) -> float:
        """Accuracy on the training split — compare with accuracy_test()
        to gauge overfitting."""
        preds_on_train = self.randf.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, preds_on_train)

    def confusion_matrix(self):
        """Render the test-set confusion matrix as a heatmap."""
        cm = metrics.confusion_matrix(self.y_test, self.randf_predict)
        display = metrics.ConfusionMatrixDisplay(cm)
        display.plot(cmap='Blues')
        plt.grid(False)
        plt.title('Random Forest Confusion Matrix')
        plt.show()

    def grid_search(self) -> tuple[float, GridSearchCV]:
        """Cross-validated search over n_estimators and max_depth.

        Returns (test-set accuracy of the best estimator, fitted grid object).
        """
        search_params = {
            'n_estimators': range(70, 230, 30),
            'criterion': ['entropy'],
            'max_depth': range(4, 10),
            'random_state': [1],
        }
        grid = GridSearchCV(self.randf, search_params, scoring='accuracy', n_jobs=2)
        grid.fit(self.x_train, self.y_train)
        return grid.score(self.x_test, self.y_test), grid
# Train the random forest and report train vs test accuracy (overfit check),
# then run the hyperparameter grid search.
randf_model = RandForestClassifier(x_train, y_train, x_test, y_test)
print(f'Random Forest: {randf_model.accuracy_train() * 100:.3f}%')
print(f'Random Forest: {randf_model.accuracy_test() * 100:.3f}%')
randf_model.confusion_matrix()
grid_res = randf_model.grid_search()
best_test_score, fitted_grid = grid_res
print(f'Random Forest\nTest Score: {best_test_score}\nParams: {fitted_grid.best_params_}')
Random Forest: 96.556% Random Forest: 92.560%
Random Forest
Test Score: 0.9241071428571429
Params: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 100, 'random_state': 1}
def n_estimators_effects():
    """Plot train/test accuracy of the random forest as n_estimators grows.

    Fix: the plots previously had no labels, so plt.legend() emitted
    "No artists with labels found" and drew nothing — label both curves.
    """
    n_estimators = range(1, 200, 10)
    train_res = []
    test_res = []
    for est_count in n_estimators:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test, n_estimators=est_count)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())
    plt.plot(n_estimators, test_res, label='Test accuracy')
    plt.plot(n_estimators, train_res, label='Train accuracy')
    plt.xlabel('n_estimators')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
def max_depth_effects():
    """Plot train/test accuracy of the random forest as max_depth grows.

    Fix: the plots previously had no labels, so plt.legend() emitted
    "No artists with labels found" and drew nothing — label both curves.
    """
    max_depths = range(1, 20)
    train_res = []
    test_res = []
    for depth in max_depths:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test, max_depth=depth)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())
    plt.plot(max_depths, test_res, label='Test accuracy')
    plt.plot(max_depths, train_res, label='Train accuracy')
    plt.xlabel('max_depth')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
# Draw both hyperparameter sweeps side by side in one figure.
plt.figure(figsize=(10, 4))
plt.suptitle('Hyperparameter Effects on Accuracy on Random Forest Classifier')
for position, effect_plot in ((1, n_estimators_effects), (2, max_depth_effects)):
    plt.subplot(1, 2, position)
    effect_plot()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
# Compare bias/variance of the tuned decision tree vs the random forest.
# `dtree_params` comes from the decision-tree grid search earlier in the notebook.
dtree = DecisionTreeClassifier(** dtree_params)
# NOTE(review): loss='mse' on classifiers — mlxtend also supports '0-1_loss'
# for classification; confirm 'mse' is intended here.
_, dtree_bias, dtree_var = bias_variance_decomp(
dtree,
x_train.values, y_train.values,
x_test.values, y_test.values,
loss='mse', random_seed=10
)
_, randf_bias, randf_var = bias_variance_decomp(randf_model.randf,
x_train.values, y_train.values,
x_test.values, y_test.values,
loss='mse', random_seed=1)
# Render the four numbers as bold HTML in the notebook output.
display(HTML(f'<b>Decision Tree Bias:</b> {dtree_bias:.3f}'))
display(HTML(f'<b>Decision Tree Variance:</b> {dtree_var:.3f}'))
display(HTML(f'<b>Random Forest Bias:</b> {randf_bias:.3f}'))
display(HTML(f'<b>Random Forest Variance:</b> {randf_var:.3f}'))
According to the formula, Bias is the difference between the average prediction of our model and the correct value which we are trying to predict. A high bias can cause an algorithm to miss the relevant relations between features and target outputs (underfitting).
Variance is the variability of model prediction for a given data point or a value which indicates the spread of the data.
High variance can result in overfitting, which means the model is capturing the random noise in the training data instead of the desired outputs.
Random forests are an ensemble method that combines multiple decision trees to reduce variance and improve generalization, while Decision Trees have more variance and less bias because they tend to overfit the training data.
plot_tree(dtree)  # Visualize the decision tree built from the grid search's best params
[Text(0.41304347826086957, 0.9, 'x[4] <= -0.458\nentropy = 0.999\nsamples = 1568\nvalue = [812, 756]'), Text(0.17391304347826086, 0.7, 'x[2] <= -0.414\nentropy = 0.474\nsamples = 798\nvalue = [717, 81]'), Text(0.08695652173913043, 0.5, 'x[7] <= 0.421\nentropy = 0.032\nsamples = 608\nvalue = [606, 2]'), Text(0.043478260869565216, 0.3, 'entropy = 0.0\nsamples = 600\nvalue = [600, 0]'), Text(0.13043478260869565, 0.3, 'entropy = 0.811\nsamples = 8\nvalue = [6, 2]'), Text(0.2608695652173913, 0.5, 'x[2] <= 0.06\nentropy = 0.979\nsamples = 190\nvalue = [111, 79]'), Text(0.21739130434782608, 0.3, 'x[4] <= -0.625\nentropy = 0.803\nsamples = 147\nvalue = [111, 36]'), Text(0.17391304347826086, 0.1, 'entropy = 0.116\nsamples = 64\nvalue = [63, 1]'), Text(0.2608695652173913, 0.1, 'entropy = 0.982\nsamples = 83\nvalue = [48, 35]'), Text(0.30434782608695654, 0.3, 'entropy = 0.0\nsamples = 43\nvalue = [0, 43]'), Text(0.6521739130434783, 0.7, 'x[2] <= -0.362\nentropy = 0.539\nsamples = 770\nvalue = [95, 675]'), Text(0.4782608695652174, 0.5, 'x[4] <= -0.368\nentropy = 0.997\nsamples = 103\nvalue = [48, 55]'), Text(0.391304347826087, 0.3, 'x[2] <= -0.512\nentropy = 0.746\nsamples = 33\nvalue = [26, 7]'), Text(0.34782608695652173, 0.1, 'entropy = 0.439\nsamples = 22\nvalue = [20, 2]'), Text(0.43478260869565216, 0.1, 'entropy = 0.994\nsamples = 11\nvalue = [6, 5]'), Text(0.5652173913043478, 0.3, 'x[6] <= 0.859\nentropy = 0.898\nsamples = 70\nvalue = [22, 48]'), Text(0.5217391304347826, 0.1, 'entropy = 0.958\nsamples = 58\nvalue = [22, 36]'), Text(0.6086956521739131, 0.1, 'entropy = 0.0\nsamples = 12\nvalue = [0, 12]'), Text(0.8260869565217391, 0.5, 'x[8] <= -1.236\nentropy = 0.368\nsamples = 667\nvalue = [47, 620]'), Text(0.7391304347826086, 0.3, 'x[6] <= 0.217\nentropy = 0.577\nsamples = 211\nvalue = [29, 182]'), Text(0.6956521739130435, 0.1, 'entropy = 0.908\nsamples = 65\nvalue = [21, 44]'), Text(0.782608695652174, 0.1, 'entropy = 0.306\nsamples = 146\nvalue = [8, 138]'), 
Text(0.9130434782608695, 0.3, 'x[4] <= -0.166\nentropy = 0.24\nsamples = 456\nvalue = [18, 438]'), Text(0.8695652173913043, 0.1, 'entropy = 0.506\nsamples = 125\nvalue = [14, 111]'), Text(0.9565217391304348, 0.1, 'entropy = 0.094\nsamples = 331\nvalue = [4, 327]')]
Adding noise to a dataset is a technique used for data anonymization, enhancing privacy and security. It makes it difficult to identify individuals in the dataset and mitigates risks of re-identification and inference attacks. However, it may reduce data quality and utility. The impact of noise addition varies based on the specific tasks and applications for which the data is used. If noise addition follows a differential privacy framework, it provides a formal guarantee of privacy protection. Balancing privacy preservation and data utility is crucial. Adherence to privacy-preserving frameworks like differential privacy can provide formal guarantees of individual privacy protection.
Impulse ("pulsed") noise and Gaussian noise are two common methods for adding noise to data; the code below uses Laplace noise, the standard mechanism in differential privacy.
def noise_gen(data, sensitivity, eps=1.0):
    """Draw Laplace noise with the same shape as `data`.

    The scale sensitivity/eps follows the Laplace mechanism used in
    differential privacy: larger eps (privacy budget) means less noise.
    """
    scale = sensitivity / eps
    return np.random.laplace(loc=0, scale=scale, size=data.shape)
def add_noise():
    """Build noisy copies of the global x_train/x_test at several Laplace scales.

    Returns two parallel lists (noisy train sets, noisy test sets), one entry
    per sensitivity value.
    """
    sensitivity_range = [0.1, 0.5, 1.0, 2.0, 2.5, 3]
    # Generate train noise before test noise for each sensitivity, matching
    # the original draw order from the shared RNG stream.
    noisy_pairs = [
        (x_train + noise_gen(x_train, sensitivity),
         x_test + noise_gen(x_test, sensitivity))
        for sensitivity in sensitivity_range
    ]
    nx_train = [pair[0] for pair in noisy_pairs]
    nx_test = [pair[1] for pair in noisy_pairs]
    return nx_train, nx_test
def calculate_accuracy(class_type, sensitivity_range, DT_grid_search_params, noisy_X_trains, y_train, noisy_X_tests, y_test):
    """Test-set accuracy of `class_type` trained on each noisy dataset.

    One score is produced per entry of `sensitivity_range`, indexing into the
    parallel lists of noisy train/test sets.
    NOTE(review): `DT_grid_search_params` is accepted but never used — confirm
    whether it should be forwarded to Classifier.
    """
    return [
        Classifier(class_type, noisy_X_trains[i], y_train, noisy_X_tests[i], y_test).accuracy_test()
        for i in range(len(sensitivity_range))
    ]
def plot_accuracy_vs_sensitivity(sensitivity_range, scores):
    """Line plot of classifier accuracy against noise sensitivity."""
    plt.title('Decision Tree Accuracy with noise')
    plt.plot(sensitivity_range, scores)
    plt.xlabel('sensitivity')
    plt.ylabel('Accuracy')
    plt.show()
# Evaluate each classifier on the noisy datasets.
# Fix: the original assigned `sensitivities` (29 values, never used) but then
# passed/plotted the undefined name `sensitivity_range`; it must also match the
# 6 sensitivity values hard-coded inside add_noise().
nx_train, nx_test = add_noise()
sensitivity_range = [0.1, 0.5, 1.0, 2.0, 2.5, 3]
scores = calculate_accuracy(DecisionTreeClassifier, sensitivity_range, dtree_params, nx_train, y_train, nx_test, y_test)
plot_accuracy_vs_sensitivity(sensitivity_range, scores)
scores = calculate_accuracy(KNeighborsClassifier, sensitivity_range, knear_params, nx_train, y_train, nx_test, y_test)
plot_accuracy_vs_sensitivity(sensitivity_range, scores)
# NOTE(review): `knear_params` is passed for LogisticRegression as well —
# harmless since calculate_accuracy ignores the params, but confirm intent.
scores = calculate_accuracy(LogisticRegression, sensitivity_range, knear_params, nx_train, y_train, nx_test, y_test)
plot_accuracy_vs_sensitivity(sensitivity_range, scores)
# Fit each classifier on the dataset with sensitivity 2.0 (index 3).
noisy_dtree_model = Classifier(DecisionTreeClassifier, nx_train[3], y_train, nx_test[3], y_test)
noisy_knear_model = Classifier(KNeighborsClassifier, nx_train[3], y_train, nx_test[3], y_test)
noisy_logreg_model = Classifier(LogisticRegression, nx_train[3], y_train, nx_test[3], y_test)
# Print accuracy on the noisy dataset
print(f'Noisy Decision Tree: {noisy_dtree_model.accuracy_test() * 100:.3f}%')
print(f'Noisy K Nearest Neighbors: {noisy_knear_model.accuracy_test() * 100:.3f}%')
print(f'Noisy Logistic Regression: {noisy_logreg_model.accuracy_test() * 100:.3f}%')
Noisy Decision Tree: 58.482% Noisy K Nearest Neighbors: 66.220% Noisy Logistic Regression: 69.494%
# Compare confusion matrices of the three classifiers trained on the noisy data
noisy_dtree_model.confusion_matrix()
noisy_knear_model.confusion_matrix()
noisy_logreg_model.confusion_matrix()
Gradient Boosting: It builds an ensemble of decision trees to make predictions. It works by iteratively adding decision trees to the ensemble, with each new tree attempting to correct the errors made by the previous trees.
The algorithm starts by building a single decision tree on the training data. It then uses this tree to make predictions on the training data, and calculates the errors between the predicted values and the actual values. These errors are then used to train a second decision tree, which attempts to correct the errors made by the first tree. This process is repeated for a specified number of iterations, with each new tree attempting to correct the errors made by the previous trees.
During each iteration, the algorithm calculates the gradient of the loss function with respect to the predictions made by the current ensemble. This gradient is then used to fit a new decision tree to the negative gradient of the loss function, which is the residual error that the current ensemble is unable to explain. The new tree is then added to the ensemble, and the process is repeated until the specified number of iterations is reached.
The final prediction of the ensemble is the sum of the predictions made by all the individual trees in the ensemble. The algorithm uses a learning rate parameter to control the contribution of each new tree to the final prediction. A smaller learning rate will result in a more conservative model, while a larger learning rate will result in a more aggressive model.
Differences:
The main difference between Gradient Boosting and Decision Trees is that Gradient Boosting is an ensemble method that combines multiple decision trees to make predictions, while Decision Trees are standalone models that make predictions based on a single tree.
Decision Trees are a simple and interpretable model that can be used for both regression and classification tasks. They work by recursively splitting the data into subsets based on the values of the input variables, until a stopping criterion is met. The resulting tree structure can be visualized and interpreted, making it easy to understand how the model is making predictions.
Gradient Boosting, on the other hand, is a more complex and powerful algorithm that can be used for both regression and classification tasks. It works by iteratively adding decision trees to the ensemble, with each new tree attempting to correct the errors made by the previous trees. The resulting ensemble of trees can be more accurate than a single decision tree, but it is also more complex and harder to interpret.
Another difference between Gradient Boosting and Decision Trees is that Gradient Boosting can capture complex non-linear relationships between the input variables and the output variable more reliably, because the ensemble refines the fit over many trees rather than relying on a single tree's splits. Gradient Boosting can also handle missing data and outliers more effectively than Decision Trees, as it is less prone to overfitting.
Gradient boosting is an ensemble learning technique used for both regression and classification problems. It builds a series of weak learners, typically decision trees, sequentially. Each new tree corrects the errors of the combined ensemble of the existing trees. The predictions of each tree are weighted based on their performance, with more accurate trees receiving higher weights.
Working Process: Initial Model: Start with a simple model, often a shallow decision tree. Residuals Calculation: Calculate the residuals (differences between predictions and actual values) for the current model. Next Model: Build a new weak learner (tree) that focuses on minimizing the residuals of the previous model. Weighted Combination: Combine the new model with the previous ones, giving more weight to accurate models and less weight to less accurate ones. Iteration: Repeat steps 2-4 until a predefined number of models are created or until a specified level of performance is achieved.
Difference Between Boosting Trees and Decision Trees:
Boosting Trees:
Ensemble of Weak Models: Boosting trees involve creating an ensemble of weak models, usually shallow decision trees. Sequential Learning: Trees are built sequentially, with each tree aiming to correct the errors of the combined ensemble. Weighted Contributions: Each tree's contribution to the final prediction is weighted based on its accuracy.
Decision Trees:
Individual Strong Model: A single decision tree is a standalone model. Non-Sequential Learning: Decision trees are constructed independently without considering errors from previous models. Equal Contributions: Each tree has an equal contribution to the final prediction, and there is no weighting based on accuracy.
Summary:
Boosting trees are an ensemble method that combines weak models sequentially to improve accuracy. Decision trees, when used individually, are standalone models without the sequential learning and weighted contributions characteristic of boosting trees.
In summary, gradient boosting is a powerful technique that iteratively builds weak learners, often decision trees, with each new model aiming to correct the errors of the combined ensemble. This sequential learning and weighted combination process allows boosting to achieve high predictive accuracy.
XGBoost (Extreme Gradient Boosting) is a popular implementation of the Gradient Boosting algorithm that is designed to be highly scalable and efficient. The XGBoost algorithm uses a technique called gradient boosting to minimize a loss function, which is a measure of how well the model is able to predict the target variable. One of the key features of XGBoost is its ability to handle missing data and outliers. It does this by using a technique called regularization, which penalizes complex models and encourages simpler models that are less likely to overfit the data. XGBoost also uses a technique called pruning to remove branches of the decision tree that do not contribute to the overall accuracy of the model.
Another important feature of XGBoost is its ability to handle both sparse and dense data. It does this by using a technique called sparsity-aware split finding, which is able to handle missing values and zero values in sparse data more efficiently than traditional split finding algorithms.
XGBoost also includes a number of other features that make it a powerful and flexible algorithm for machine learning tasks. These include support for custom loss functions, early stopping to prevent overfitting, and the ability to handle multi-class classification problems.
Overall, XGBoost is a highly effective algorithm for a wide range of machine learning tasks, including regression, classification, and ranking. Its ability to handle missing data, outliers, and sparse data, as well as its support for custom loss functions and early stopping, make it a popular choice for many data scientists and machine learning practitioners.
Here are some of the most important hyperparameters for the XGBoost algorithm:
max_depth: The maximum depth of each decision tree in the ensemble. Increasing this value can make the model more complex and potentially more accurate, but may also increase the risk of overfitting.
learning_rate: The step size used to update the weights of the model during each iteration. A smaller learning rate can make the model more conservative and less prone to overfitting, but may also require more iterations to converge.
n_estimators: The number of decision trees in the ensemble. Increasing this value can make the model more accurate, but may also increase the risk of overfitting and make the model slower to train.
subsample: The fraction of the training data used to train each decision tree. Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting.
colsample_bytree: The fraction of the features used to train each decision tree. Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting.
gamma: The minimum reduction in the loss function required to make a split at a node. Increasing this value can make the model more conservative and less prone to overfitting.
reg_alpha: L1 regularization term on weights. Increasing this value can make the model more conservative and less prone to overfitting.
reg_lambda: L2 regularization term on weights. Increasing this value can make the model more conservative and less prone to overfitting.
# Tune an XGBoost classifier with a cross-validated grid search.
# Fix: removed the commented-out alternative parameter grid (dead code).
xgb = XGBClassifier()
params = {
    'max_depth': range(3, 11, 2),          # tree depth: complexity vs overfitting
    'learning_rate': [0.01, 0.03, 0.09, 0.1],
    'n_estimators': range(20, 201, 30),    # number of boosting rounds
    'gamma': [0, 1, 10, 100]               # min loss reduction required to split
}
xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=params, cv=5, scoring="accuracy")
xgb_grid_search.fit(x_train, y_train)
print(f"Best Accuracy: {xgb_grid_search.best_score_ * 100:2.2f}%")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
print(f"Gradient-boosting Accuracy: {xgb_grid_search.score(x_test, y_test) * 100:2.2f}%")
Best Accuracy: 93.18%
Best Parameters: {'gamma': 1, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 200}
Gradient-boosting Accuracy: 92.56%
# Confusion matrix of the best XGBoost model found by the grid search.
y_pred = xgb_grid_search.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
# Label rows/columns with the two classes so the heatmap axes are readable.
cm_df = pd.DataFrame(cm, index=['0', '1'], columns=['0', '1'])
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
plt.title('XGBoost Confusion Matrix')
plt.show()